Bitcoin ABC  0.29.2
P2P Digital Currency
nanobench.h
Go to the documentation of this file.
1 // __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2 // | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3 // | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4 //
5 // Microbenchmark framework for C++11/14/17/20
6 // https://github.com/martinus/nanobench
7 //
8 // Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9 // SPDX-License-Identifier: MIT
10 // Copyright (c) 2019-2021 Martin Ankerl <[email protected]>
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining a copy
13 // of this software and associated documentation files (the "Software"), to deal
14 // in the Software without restriction, including without limitation the rights
15 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 // copies of the Software, and to permit persons to whom the Software is
17 // furnished to do so, subject to the following conditions:
18 //
19 // The above copyright notice and this permission notice shall be included in all
20 // copies or substantial portions of the Software.
21 //
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 // SOFTWARE.
29 
30 #ifndef ANKERL_NANOBENCH_H_INCLUDED
31 #define ANKERL_NANOBENCH_H_INCLUDED
32 
33 // see https://semver.org/
34 #define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35 #define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
36 #define ANKERL_NANOBENCH_VERSION_PATCH 6 // backwards-compatible bug fixes
37 
39 // public facing api - as minimal as possible
41 
42 #include <chrono> // high_resolution_clock
43 #include <cstring> // memcpy
44 #include <iosfwd> // for std::ostream* custom output target in Config
45 #include <string> // all names
46 #include <vector> // holds all results
47 
48 #define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
49 
50 #define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
51 #define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
52 #define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
53 #define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
54 #define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
55 
56 #if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
57 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
58 #else
59 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
60 #endif
61 
62 #if defined(__clang__)
63 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
64  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
65 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
66 #else
67 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
68 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
69 #endif
70 
71 #if defined(__GNUC__)
72 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
73 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
74 #else
75 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
76 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
77 #endif
78 
79 #if defined(ANKERL_NANOBENCH_LOG_ENABLED)
80 # include <iostream>
81 # define ANKERL_NANOBENCH_LOG(x) \
82  do { \
83  std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
84  } while (0)
85 #else
86 # define ANKERL_NANOBENCH_LOG(x) \
87  do { \
88  } while (0)
89 #endif
90 
91 #define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
92 #if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
93 # include <linux/version.h>
94 # if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
95 // PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
96 // PERF_FLAG_FD_CLOEXEC since kernel 3.14
97 # undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
98 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
99 # endif
100 #endif
101 
102 #if defined(__clang__)
103 # define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
104 #else
105 # define ANKERL_NANOBENCH_NO_SANITIZE(...)
106 #endif
107 
108 #if defined(_MSC_VER)
109 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
110 #else
111 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
112 #endif
113 
114 // workaround missing "is_trivially_copyable" in g++ < 5.0
115 // See https://stackoverflow.com/a/31798726/48181
116 #if defined(__GNUC__) && __GNUC__ < 5
117 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
118 #else
119 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
120 #endif
121 
122 // declarations ///////////////////////////////////////////////////////////////////////////////////
123 
124 namespace ankerl {
125 namespace nanobench {
126 
127 using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
128  std::chrono::steady_clock>::type;
129 class Bench;
130 struct Config;
131 class Result;
132 class Rng;
133 class BigO;
134 
283 void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
284 void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
285 
294 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
295 void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
296 
297 // Contains mustache-like templates
298 namespace templates {
299 
309 char const* csv() noexcept;
310 
321 char const* htmlBoxplot() noexcept;
322 
329 char const* pyperf() noexcept;
330 
340 char const* json() noexcept;
341 
342 } // namespace templates
343 
344 namespace detail {
345 
346 template <typename T>
347 struct PerfCountSet;
348 
349 class IterationLogic;
350 class PerformanceCounters;
351 
352 #if ANKERL_NANOBENCH(PERF_COUNTERS)
353 class LinuxPerformanceCounters;
354 #endif
355 
356 } // namespace detail
357 } // namespace nanobench
358 } // namespace ankerl
359 
360 // definitions ////////////////////////////////////////////////////////////////////////////////////
361 
362 namespace ankerl {
363 namespace nanobench {
364 namespace detail {
365 
366 template <typename T>
367 struct PerfCountSet {
374 };
375 
376 } // namespace detail
377 
378 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
379 struct Config {
380  // actual benchmark config
381  std::string mBenchmarkTitle = "benchmark";
382  std::string mBenchmarkName = "noname";
383  std::string mUnit = "op";
384  double mBatch = 1.0;
385  double mComplexityN = -1.0;
386  size_t mNumEpochs = 11;
387  size_t mClockResolutionMultiple = static_cast<size_t>(1000);
388  std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
389  std::chrono::nanoseconds mMinEpochTime{};
390  uint64_t mMinEpochIterations{1};
391  uint64_t mEpochIterations{0}; // If not 0, run *exactly* these number of iterations per epoch.
392  uint64_t mWarmup = 0;
393  std::ostream* mOut = nullptr;
394  std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1};
395  std::string mTimeUnitName = "ns";
396  bool mShowPerformanceCounters = true;
397  bool mIsRelative = false;
398 
403  Config(Config const&);
404  Config(Config&&) noexcept;
405 };
406 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
407 
408 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
409 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
410 class Result {
411 public:
412  enum class Measure : size_t {
413  elapsed,
414  iterations,
415  pagefaults,
416  cpucycles,
417  contextswitches,
418  instructions,
419  branchinstructions,
420  branchmisses,
421  _size
422  };
423 
424  explicit Result(Config const& benchmarkConfig);
425 
429  Result(Result const&);
430  Result(Result&&) noexcept;
431 
432  // adds new measurement results
433  // all values are scaled by iters (except iters...)
434  void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
435 
436  ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
437 
438  ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
439  ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
440  ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
441  ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
442  ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
443  ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
444  ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
445 
446  ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
447  ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
448  ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
449  ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
450 
451  // Finds string, if not found, returns _size.
452  static Measure fromString(std::string const& str);
453 
454 private:
455  Config mConfig{};
456  std::vector<std::vector<double>> mNameToMeasurements{};
457 };
458 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
459 
460 
477 class Rng final {
478 public:
482  using result_type = uint64_t;
483 
484  static constexpr uint64_t(min)();
485  static constexpr uint64_t(max)();
486 
492  Rng(Rng const&) = delete;
493 
497  Rng& operator=(Rng const&) = delete;
498 
499  // moving is ok
500  Rng(Rng&&) noexcept = default;
501  Rng& operator=(Rng&&) noexcept = default;
502  ~Rng() noexcept = default;
503 
511  Rng();
512 
529  explicit Rng(uint64_t seed) noexcept;
530  Rng(uint64_t x, uint64_t y) noexcept;
531  Rng(std::vector<uint64_t> const& data);
532 
536  ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
537 
545  inline uint64_t operator()() noexcept;
546 
547  // This is slightly biased. See
548 
563  inline uint32_t bounded(uint32_t range) noexcept;
564 
565  // random double in range [0, 1(
566  // see http://prng.di.unimi.it/
567 
574  inline double uniform01() noexcept;
575 
583  template <typename Container>
584  void shuffle(Container& container) noexcept;
585 
592  std::vector<uint64_t> state() const;
593 
594 private:
595  static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
596 
597  uint64_t mX;
598  uint64_t mY;
599 };
600 
615 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
616 class Bench {
617 public:
621  Bench();
622 
623  Bench(Bench&& other);
624  Bench& operator=(Bench&& other);
625  Bench(Bench const& other);
626  Bench& operator=(Bench const& other);
627  ~Bench() noexcept;
628 
647  template <typename Op>
648  ANKERL_NANOBENCH(NOINLINE)
649  Bench& run(char const* benchmarkName, Op&& op);
650 
651  template <typename Op>
652  ANKERL_NANOBENCH(NOINLINE)
653  Bench& run(std::string const& benchmarkName, Op&& op);
654 
659  template <typename Op>
660  ANKERL_NANOBENCH(NOINLINE)
661  Bench& run(Op&& op);
662 
668  Bench& title(char const* benchmarkTitle);
669  Bench& title(std::string const& benchmarkTitle);
670  ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
671 
673  Bench& name(char const* benchmarkName);
674  Bench& name(std::string const& benchmarkName);
675  ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
676 
686  template <typename T>
687  Bench& batch(T b) noexcept;
688  ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
689 
698  Bench& unit(char const* unit);
699  Bench& unit(std::string const& unit);
700  ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
701 
711  Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
712  ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
713  ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
714 
722  Bench& output(std::ostream* outstream) noexcept;
723  ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
724 
745  Bench& clockResolutionMultiple(size_t multiple) noexcept;
746  ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
747 
763  Bench& epochs(size_t numEpochs) noexcept;
764  ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
765 
776  Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
777  ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
778 
789  Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
790  ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
791 
802  Bench& minEpochIterations(uint64_t numIters) noexcept;
803  ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
804 
811  Bench& epochIterations(uint64_t numIters) noexcept;
812  ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
813 
823  Bench& warmup(uint64_t numWarmupIters) noexcept;
824  ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
825 
843  Bench& relative(bool isRelativeEnabled) noexcept;
844  ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
845 
854  Bench& performanceCounters(bool showPerformanceCounters) noexcept;
855  ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
856 
865  ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
866 
874  template <typename Arg>
876 
891  template <typename T>
892  Bench& complexityN(T b) noexcept;
893  ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
894 
926  std::vector<BigO> complexityBigO() const;
927 
951  template <typename Op>
952  BigO complexityBigO(char const* name, Op op) const;
953 
954  template <typename Op>
955  BigO complexityBigO(std::string const& name, Op op) const;
956 
964  Bench& render(char const* templateContent, std::ostream& os);
965  Bench& render(std::string const& templateContent, std::ostream& os);
966 
967  Bench& config(Config const& benchmarkConfig);
968  ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
969 
970 private:
971  Config mConfig{};
972  std::vector<Result> mResults{};
973 };
974 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
975 
976 
982 template <typename Arg>
983 void doNotOptimizeAway(Arg&& arg);
984 
985 namespace detail {
986 
987 #if defined(_MSC_VER)
988 void doNotOptimizeAwaySink(void const*);
989 
990 template <typename T>
991 void doNotOptimizeAway(T const& val);
992 
993 #else
994 
995 // These assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but
996 // this seemd to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyways.
997 // see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
998 template <typename T>
999 void doNotOptimizeAway(T const& val) {
1000  // NOLINTNEXTLINE(hicpp-no-assembler)
1001  asm volatile("" : : "r,m"(val) : "memory");
1002 }
1003 
1004 template <typename T>
1005 void doNotOptimizeAway(T& val) {
1006 # if defined(__clang__)
1007  // NOLINTNEXTLINE(hicpp-no-assembler)
1008  asm volatile("" : "+r,m"(val) : : "memory");
1009 # else
1010  // NOLINTNEXTLINE(hicpp-no-assembler)
1011  asm volatile("" : "+m,r"(val) : : "memory");
1012 # endif
1013 }
1014 #endif
1015 
1016 // internally used, but visible because run() is templated.
1017 // Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
1018 // having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
1019 ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
1021 public:
1022  explicit IterationLogic(Bench const& config) noexcept;
1024 
1025  ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
1026  void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
1027  void moveResultTo(std::vector<Result>& results) noexcept;
1028 
1029 private:
1030  struct Impl;
1031  Impl* mPimpl;
1032 };
1033 ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1034 
1035 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1037 public:
1040 
1043 
1045  void endMeasure();
1046  void updateResults(uint64_t numIters);
1047 
1048  ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1049  ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1050 
1051 private:
1052 #if ANKERL_NANOBENCH(PERF_COUNTERS)
1053  LinuxPerformanceCounters* mPc = nullptr;
1054 #endif
1057 };
1058 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1059 
1060 // Gets the singleton
1062 
1063 } // namespace detail
1064 
1065 class BigO {
1066 public:
1067  using RangeMeasure = std::vector<std::pair<double, double>>;
1068 
1069  template <typename Op>
1071  for (auto& rangeMeasure : data) {
1072  rangeMeasure.first = op(rangeMeasure.first);
1073  }
1074  return data;
1075  }
1076 
1077  static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1078 
1079  template <typename Op>
1080  BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1081  : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1082 
1083  template <typename Op>
1084  BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1085  : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1086 
1087  BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1088  BigO(std::string const& bigOName, RangeMeasure const& scaledRangeMeasure);
1089  ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1090  ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1091  ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1092  ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1093 
1094 private:
1095  std::string mName{};
1096  double mConstant{};
1097  double mNormalizedRootMeanSquare{};
1098 };
1099 std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1100 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1101 
1102 } // namespace nanobench
1103 } // namespace ankerl
1104 
1105 // implementation /////////////////////////////////////////////////////////////////////////////////
1106 
1107 namespace ankerl {
1108 namespace nanobench {
1109 
1110 constexpr uint64_t(Rng::min)() {
1111  return 0;
1112 }
1113 
1114 constexpr uint64_t(Rng::max)() {
1115  return (std::numeric_limits<uint64_t>::max)();
1116 }
1117 
1118 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1119 uint64_t Rng::operator()() noexcept {
1120  auto x = mX;
1121 
1122  mX = UINT64_C(15241094284759029579) * mY;
1123  mY = rotl(mY - x, 27);
1124 
1125  return x;
1126 }
1127 
1128 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1129 uint32_t Rng::bounded(uint32_t range) noexcept {
1130  uint64_t r32 = static_cast<uint32_t>(operator()());
1131  auto multiresult = r32 * range;
1132  return static_cast<uint32_t>(multiresult >> 32U);
1133 }
1134 
1135 double Rng::uniform01() noexcept {
1136  auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1137  // can't use union in c++ here for type puning, it's undefined behavior.
1138  // std::memcpy is optimized anyways.
1139  double d;
1140  std::memcpy(&d, &i, sizeof(double));
1141  return d - 1.0;
1142 }
1143 
1144 template <typename Container>
1145 void Rng::shuffle(Container& container) noexcept {
1146  auto size = static_cast<uint32_t>(container.size());
1147  for (auto i = size; i > 1U; --i) {
1148  using std::swap;
1149  auto p = bounded(i); // number in [0, i)
1150  swap(container[i - 1], container[p]);
1151  }
1152 }
1153 
1154 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1155 constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1156  return (x << k) | (x >> (64U - k));
1157 }
1158 
1159 template <typename Op>
1161 Bench& Bench::run(Op&& op) {
1162  // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()
1163  detail::IterationLogic iterationLogic(*this);
1164  auto& pc = detail::performanceCounters();
1165 
1166  while (auto n = iterationLogic.numIters()) {
1167  pc.beginMeasure();
1168  Clock::time_point before = Clock::now();
1169  while (n-- > 0) {
1170  op();
1171  }
1172  Clock::time_point after = Clock::now();
1173  pc.endMeasure();
1174  pc.updateResults(iterationLogic.numIters());
1175  iterationLogic.add(after - before, pc);
1176  }
1177  iterationLogic.moveResultTo(mResults);
1178  return *this;
1179 }
1180 
1181 // Performs all evaluations.
1182 template <typename Op>
1183 Bench& Bench::run(char const* benchmarkName, Op&& op) {
1184  name(benchmarkName);
1185  return run(std::forward<Op>(op));
1186 }
1187 
1188 template <typename Op>
1189 Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1190  name(benchmarkName);
1191  return run(std::forward<Op>(op));
1192 }
1193 
1194 template <typename Op>
1195 BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1196  return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1197 }
1198 
1199 template <typename Op>
1200 BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1201  return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1202 }
1203 
1204 // Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
1205 // Any argument is cast to double.
1206 template <typename T>
1207 Bench& Bench::batch(T b) noexcept {
1208  mConfig.mBatch = static_cast<double>(b);
1209  return *this;
1210 }
1211 
1212 // Sets the computation complexity of the next run. Any argument is cast to double.
1213 template <typename T>
1214 Bench& Bench::complexityN(T n) noexcept {
1215  mConfig.mComplexityN = static_cast<double>(n);
1216  return *this;
1217 }
1218 
1219 // Convenience: makes sure none of the given arguments are optimized away by the compiler.
1220 template <typename Arg>
1222  detail::doNotOptimizeAway(std::forward<Arg>(arg));
1223  return *this;
1224 }
1225 
1226 // Makes sure none of the given arguments are optimized away by the compiler.
1227 template <typename Arg>
1228 void doNotOptimizeAway(Arg&& arg) {
1229  detail::doNotOptimizeAway(std::forward<Arg>(arg));
1230 }
1231 
1232 namespace detail {
1233 
1234 #if defined(_MSC_VER)
1235 template <typename T>
1236 void doNotOptimizeAway(T const& val) {
1237  doNotOptimizeAwaySink(&val);
1238 }
1239 
1240 #endif
1241 
1242 } // namespace detail
1243 } // namespace nanobench
1244 } // namespace ankerl
1245 
1246 #if defined(ANKERL_NANOBENCH_IMPLEMENT)
1247 
1249 // implementation part - only visible in .cpp
1251 
1252 # include <algorithm> // sort, reverse
1253 # include <atomic> // compare_exchange_strong in loop overhead
1254 # include <cstdlib> // getenv
1255 # include <cstring> // strstr, strncmp
1256 # include <fstream> // ifstream to parse proc files
1257 # include <iomanip> // setw, setprecision
1258 # include <iostream> // cout
1259 # include <numeric> // accumulate
1260 # include <random> // random_device
1261 # include <sstream> // to_s in Number
1262 # include <stdexcept> // throw for rendering templates
1263 # include <tuple> // std::tie
1264 # if defined(__linux__)
1265 # include <unistd.h> //sysconf
1266 # endif
1267 # if ANKERL_NANOBENCH(PERF_COUNTERS)
1268 # include <map> // map
1269 
1270 # include <linux/perf_event.h>
1271 # include <sys/ioctl.h>
1272 # include <sys/syscall.h>
1273 # include <unistd.h>
1274 # endif
1275 
1276 // declarations ///////////////////////////////////////////////////////////////////////////////////
1277 
1278 namespace ankerl {
1279 namespace nanobench {
1280 
1281 // helper stuff that is only intended to be used internally
1282 namespace detail {
1283 
1284 struct TableInfo;
1285 
1286 // formatting utilities
1287 namespace fmt {
1288 
1289 class NumSep;
1290 class StreamStateRestorer;
1291 class Number;
1292 class MarkDownColumn;
1293 class MarkDownCode;
1294 
1295 } // namespace fmt
1296 } // namespace detail
1297 } // namespace nanobench
1298 } // namespace ankerl
1299 
1300 // definitions ////////////////////////////////////////////////////////////////////////////////////
1301 
1302 namespace ankerl {
1303 namespace nanobench {
1304 
1305 uint64_t splitMix64(uint64_t& state) noexcept;
1306 
1307 namespace detail {
1308 
1309 // helpers to get double values
1310 template <typename T>
1311 inline double d(T t) noexcept {
1312  return static_cast<double>(t);
1313 }
1314 inline double d(Clock::duration duration) noexcept {
1315  return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
1316 }
1317 
1318 // Calculates clock resolution once, and remembers the result
1319 inline Clock::duration clockResolution() noexcept;
1320 
1321 } // namespace detail
1322 
1323 namespace templates {
1324 
1325 char const* csv() noexcept {
1326  return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1327 {{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1328 {{/result}})DELIM";
1329 }
1330 
1331 char const* htmlBoxplot() noexcept {
1332  return R"DELIM(<html>
1333 
1334 <head>
1335  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1336 </head>
1337 
1338 <body>
1339  <div id="myDiv"></div>
1340  <script>
1341  var data = [
1342  {{#result}}{
1343  name: '{{name}}',
1344  y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1345  },
1346  {{/result}}
1347  ];
1348  var title = '{{title}}';
1349 
1350  data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1351  var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1352  </script>
1353 </body>
1354 
1355 </html>)DELIM";
1356 }
1357 
1358 char const* pyperf() noexcept {
1359  return R"DELIM({
1360  "benchmarks": [
1361  {
1362  "runs": [
1363  {
1364  "values": [
1365 {{#measurement}} {{elapsed}}{{^-last}},
1366 {{/last}}{{/measurement}}
1367  ]
1368  }
1369  ]
1370  }
1371  ],
1372  "metadata": {
1373  "loops": {{sum(iterations)}},
1374  "inner_loops": {{batch}},
1375  "name": "{{title}}",
1376  "unit": "second"
1377  },
1378  "version": "1.0"
1379 })DELIM";
1380 }
1381 
1382 char const* json() noexcept {
1383  return R"DELIM({
1384  "results": [
1385 {{#result}} {
1386  "title": "{{title}}",
1387  "name": "{{name}}",
1388  "unit": "{{unit}}",
1389  "batch": {{batch}},
1390  "complexityN": {{complexityN}},
1391  "epochs": {{epochs}},
1392  "clockResolution": {{clockResolution}},
1393  "clockResolutionMultiple": {{clockResolutionMultiple}},
1394  "maxEpochTime": {{maxEpochTime}},
1395  "minEpochTime": {{minEpochTime}},
1396  "minEpochIterations": {{minEpochIterations}},
1397  "epochIterations": {{epochIterations}},
1398  "warmup": {{warmup}},
1399  "relative": {{relative}},
1400  "median(elapsed)": {{median(elapsed)}},
1401  "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
1402  "median(instructions)": {{median(instructions)}},
1403  "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
1404  "median(cpucycles)": {{median(cpucycles)}},
1405  "median(contextswitches)": {{median(contextswitches)}},
1406  "median(pagefaults)": {{median(pagefaults)}},
1407  "median(branchinstructions)": {{median(branchinstructions)}},
1408  "median(branchmisses)": {{median(branchmisses)}},
1409  "totalTime": {{sumProduct(iterations, elapsed)}},
1410  "measurements": [
1411 {{#measurement}} {
1412  "iterations": {{iterations}},
1413  "elapsed": {{elapsed}},
1414  "pagefaults": {{pagefaults}},
1415  "cpucycles": {{cpucycles}},
1416  "contextswitches": {{contextswitches}},
1417  "instructions": {{instructions}},
1418  "branchinstructions": {{branchinstructions}},
1419  "branchmisses": {{branchmisses}}
1420  }{{^-last}},{{/-last}}
1421 {{/measurement}} ]
1422  }{{^-last}},{{/-last}}
1423 {{/result}} ]
1424 })DELIM";
1425 }
1426 
1427 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1428 struct Node {
1429  enum class Type { tag, content, section, inverted_section };
1430 
1431  char const* begin;
1432  char const* end;
1433  std::vector<Node> children;
1434  Type type;
1435 
1436  template <size_t N>
1437  // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
1438  bool operator==(char const (&str)[N]) const noexcept {
1439  return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1440  }
1441 };
1442 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1443 
1444 static std::vector<Node> parseMustacheTemplate(char const** tpl) {
1445  std::vector<Node> nodes;
1446 
1447  while (true) {
1448  auto begin = std::strstr(*tpl, "{{");
1449  auto end = begin;
1450  if (begin != nullptr) {
1451  begin += 2;
1452  end = std::strstr(begin, "}}");
1453  }
1454 
1455  if (begin == nullptr || end == nullptr) {
1456  // nothing found, finish node
1457  nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1458  return nodes;
1459  }
1460 
1461  nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1462 
1463  // we found a tag
1464  *tpl = end + 2;
1465  switch (*begin) {
1466  case '/':
1467  // finished! bail out
1468  return nodes;
1469 
1470  case '#':
1471  nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1472  break;
1473 
1474  case '^':
1475  nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1476  break;
1477 
1478  default:
1479  nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1480  break;
1481  }
1482  }
1483 }
1484 
1485 static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1486  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1487  bool matchFirst = n == "-first";
1488  bool matchLast = n == "-last";
1489  if (!matchFirst && !matchLast) {
1490  return false;
1491  }
1492 
1493  bool doWrite = false;
1494  if (n.type == Node::Type::section) {
1495  doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1496  } else if (n.type == Node::Type::inverted_section) {
1497  doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1498  }
1499 
1500  if (doWrite) {
1501  for (auto const& child : n.children) {
1502  if (child.type == Node::Type::content) {
1503  out.write(child.begin, std::distance(child.begin, child.end));
1504  }
1505  }
1506  }
1507  return true;
1508 }
1509 
1510 static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
1511  matchResult.clear();
1512  auto idxOpen = str.find('(');
1513  auto idxClose = str.find(')', idxOpen);
1514  if (idxClose == std::string::npos) {
1515  return false;
1516  }
1517 
1518  matchResult.emplace_back(str.substr(0, idxOpen));
1519 
1520  // split by comma
1521  matchResult.emplace_back(std::string{});
1522  for (size_t i = idxOpen + 1; i != idxClose; ++i) {
1523  if (str[i] == ' ' || str[i] == '\t') {
1524  // skip whitespace
1525  continue;
1526  }
1527  if (str[i] == ',') {
1528  // got a comma => new string
1529  matchResult.emplace_back(std::string{});
1530  continue;
1531  }
1532  // no whitespace no comma, append
1533  matchResult.back() += str[i];
1534  }
1535  return true;
1536 }
1537 
1538 static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1539  using detail::d;
1540 
1541  if (n == "title") {
1542  out << config.mBenchmarkTitle;
1543  return true;
1544  } else if (n == "name") {
1545  out << config.mBenchmarkName;
1546  return true;
1547  } else if (n == "unit") {
1548  out << config.mUnit;
1549  return true;
1550  } else if (n == "batch") {
1551  out << config.mBatch;
1552  return true;
1553  } else if (n == "complexityN") {
1554  out << config.mComplexityN;
1555  return true;
1556  } else if (n == "epochs") {
1557  out << config.mNumEpochs;
1558  return true;
1559  } else if (n == "clockResolution") {
1560  out << d(detail::clockResolution());
1561  return true;
1562  } else if (n == "clockResolutionMultiple") {
1563  out << config.mClockResolutionMultiple;
1564  return true;
1565  } else if (n == "maxEpochTime") {
1566  out << d(config.mMaxEpochTime);
1567  return true;
1568  } else if (n == "minEpochTime") {
1569  out << d(config.mMinEpochTime);
1570  return true;
1571  } else if (n == "minEpochIterations") {
1572  out << config.mMinEpochIterations;
1573  return true;
1574  } else if (n == "epochIterations") {
1575  out << config.mEpochIterations;
1576  return true;
1577  } else if (n == "warmup") {
1578  out << config.mWarmup;
1579  return true;
1580  } else if (n == "relative") {
1581  out << config.mIsRelative;
1582  return true;
1583  }
1584  return false;
1585 }
1586 
1587 static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1588  if (generateConfigTag(n, r.config(), out)) {
1589  return out;
1590  }
1591  // match e.g. "median(elapsed)"
1592  // g++ 4.8 doesn't implement std::regex :(
1593  // static std::regex const regOpArg1("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\‍)$");
1594  // std::cmatch matchResult;
1595  // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1596  std::vector<std::string> matchResult;
1597  if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1598  if (matchResult.size() == 2) {
1599  auto m = Result::fromString(matchResult[1]);
1600  if (m == Result::Measure::_size) {
1601  return out << 0.0;
1602  }
1603 
1604  if (matchResult[0] == "median") {
1605  return out << r.median(m);
1606  }
1607  if (matchResult[0] == "average") {
1608  return out << r.average(m);
1609  }
1610  if (matchResult[0] == "medianAbsolutePercentError") {
1611  return out << r.medianAbsolutePercentError(m);
1612  }
1613  if (matchResult[0] == "sum") {
1614  return out << r.sum(m);
1615  }
1616  if (matchResult[0] == "minimum") {
1617  return out << r.minimum(m);
1618  }
1619  if (matchResult[0] == "maximum") {
1620  return out << r.maximum(m);
1621  }
1622  } else if (matchResult.size() == 3) {
1623  auto m1 = Result::fromString(matchResult[1]);
1624  auto m2 = Result::fromString(matchResult[2]);
1625  if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
1626  return out << 0.0;
1627  }
1628 
1629  if (matchResult[0] == "sumProduct") {
1630  return out << r.sumProduct(m1, m2);
1631  }
1632  }
1633  }
1634 
1635  // match e.g. "sumProduct(elapsed, iterations)"
1636  // static std::regex const regOpArg2("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\‍)$");
1637 
1638  // nothing matches :(
1639  throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1640 }
1641 
1642 static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1643  for (auto const& n : nodes) {
1644  if (!generateFirstLast(n, idx, r.size(), out)) {
1645  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1646  switch (n.type) {
1647  case Node::Type::content:
1648  out.write(n.begin, std::distance(n.begin, n.end));
1649  break;
1650 
1651  case Node::Type::inverted_section:
1652  throw std::runtime_error("got a inverted section inside measurement");
1653 
1654  case Node::Type::section:
1655  throw std::runtime_error("got a section inside measurement");
1656 
1657  case Node::Type::tag: {
1658  auto m = Result::fromString(std::string(n.begin, n.end));
1659  if (m == Result::Measure::_size || !r.has(m)) {
1660  out << 0.0;
1661  } else {
1662  out << r.get(idx, m);
1663  }
1664  break;
1665  }
1666  }
1667  }
1668  }
1669 }
1670 
1671 static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1672  auto const& r = results[idx];
1673  for (auto const& n : nodes) {
1674  if (!generateFirstLast(n, idx, results.size(), out)) {
1675  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1676  switch (n.type) {
1677  case Node::Type::content:
1678  out.write(n.begin, std::distance(n.begin, n.end));
1679  break;
1680 
1681  case Node::Type::inverted_section:
1682  throw std::runtime_error("got a inverted section inside result");
1683 
1684  case Node::Type::section:
1685  if (n == "measurement") {
1686  for (size_t i = 0; i < r.size(); ++i) {
1687  generateResultMeasurement(n.children, i, r, out);
1688  }
1689  } else {
1690  throw std::runtime_error("got a section inside result");
1691  }
1692  break;
1693 
1694  case Node::Type::tag:
1695  generateResultTag(n, r, out);
1696  break;
1697  }
1698  }
1699  }
1700 }
1701 
1702 } // namespace templates
1703 
1704 // helper stuff that only intended to be used internally
1705 namespace detail {
1706 
1707 char const* getEnv(char const* name);
1708 bool isEndlessRunning(std::string const& name);
1709 bool isWarningsEnabled();
1710 
1711 template <typename T>
1712 T parseFile(std::string const& filename);
1713 
1714 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1715 void printStabilityInformationOnce(std::ostream* os);
1716 
1717 // remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
1718 uint64_t& singletonHeaderHash() noexcept;
1719 
1720 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
1721 Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1722 
1723 // formatting utilities
1724 namespace fmt {
1725 
1726 // adds thousands separator to numbers
1727 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1728 class NumSep : public std::numpunct<char> {
1729 public:
1730  explicit NumSep(char sep);
1731  char do_thousands_sep() const override;
1732  std::string do_grouping() const override;
1733 
1734 private:
1735  char mSep;
1736 };
1737 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1738 
1739 // RAII to save & restore a stream's state
1740 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1741 class StreamStateRestorer {
1742 public:
1743  explicit StreamStateRestorer(std::ostream& s);
1744  ~StreamStateRestorer();
1745 
1746  // sets back all stream info that we remembered at construction
1747  void restore();
1748 
1749  // don't allow copying / moving
1750  StreamStateRestorer(StreamStateRestorer const&) = delete;
1751  StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
1752  StreamStateRestorer(StreamStateRestorer&&) = delete;
1753  StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;
1754 
1755 private:
1756  std::ostream& mStream;
1757  std::locale mLocale;
1758  std::streamsize const mPrecision;
1759  std::streamsize const mWidth;
1760  std::ostream::char_type const mFill;
1761  std::ostream::fmtflags const mFmtFlags;
1762 };
1763 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1764 
1765 // Number formatter
1766 class Number {
1767 public:
1768  Number(int width, int precision, double value);
1769  Number(int width, int precision, int64_t value);
1770  std::string to_s() const;
1771 
1772 private:
1773  friend std::ostream& operator<<(std::ostream& os, Number const& n);
1774  std::ostream& write(std::ostream& os) const;
1775 
1776  int mWidth;
1777  int mPrecision;
1778  double mValue;
1779 };
1780 
1781 // helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
1782 std::string to_s(uint64_t s);
1783 
1784 std::ostream& operator<<(std::ostream& os, Number const& n);
1785 
1786 class MarkDownColumn {
1787 public:
1788  MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val);
1789  std::string title() const;
1790  std::string separator() const;
1791  std::string invalid() const;
1792  std::string value() const;
1793 
1794 private:
1795  int mWidth;
1796  int mPrecision;
1797  std::string mTitle;
1798  std::string mSuffix;
1799  double mValue;
1800 };
1801 
1802 // Formats any text as markdown code, escaping backticks.
1803 class MarkDownCode {
1804 public:
1805  explicit MarkDownCode(std::string const& what);
1806 
1807 private:
1808  friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1809  std::ostream& write(std::ostream& os) const;
1810 
1811  std::string mWhat{};
1812 };
1813 
1814 std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1815 
1816 } // namespace fmt
1817 } // namespace detail
1818 } // namespace nanobench
1819 } // namespace ankerl
1820 
1821 // implementation /////////////////////////////////////////////////////////////////////////////////
1822 
1823 namespace ankerl {
1824 namespace nanobench {
1825 
1826 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1827  detail::fmt::StreamStateRestorer restorer(out);
1828 
1829  out.precision(std::numeric_limits<double>::digits10);
1830  auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1831 
1832  for (auto const& n : nodes) {
1833  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1834  switch (n.type) {
1835  case templates::Node::Type::content:
1836  out.write(n.begin, std::distance(n.begin, n.end));
1837  break;
1838 
1839  case templates::Node::Type::inverted_section:
1840  throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
1841 
1842  case templates::Node::Type::section:
1843  if (n == "result") {
1844  const size_t nbResults = results.size();
1845  for (size_t i = 0; i < nbResults; ++i) {
1846  generateResult(n.children, i, results, out);
1847  }
1848  } else if (n == "measurement") {
1849  if (results.size() != 1) {
1850  throw std::runtime_error(
1851  "render: can only use section 'measurement' here if there is a single result, but there are " +
1852  detail::fmt::to_s(results.size()));
1853  }
1854  // when we only have a single result, we can immediately go into its measurement.
1855  auto const& r = results.front();
1856  for (size_t i = 0; i < r.size(); ++i) {
1857  generateResultMeasurement(n.children, i, r, out);
1858  }
1859  } else {
1860  throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
1861  }
1862  break;
1863 
1864  case templates::Node::Type::tag:
1865  if (results.size() == 1) {
1866  // result & config are both supported there
1867  generateResultTag(n, results.front(), out);
1868  } else {
1869  // This just uses the last result's config.
1870  if (!generateConfigTag(n, results.back().config(), out)) {
1871  throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
1872  }
1873  }
1874  break;
1875  }
1876  }
1877 }
1878 
1879 void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1880  render(mustacheTemplate.c_str(), results, out);
1881 }
1882 
1883 void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
1884  render(mustacheTemplate, bench.results(), out);
1885 }
1886 
1887 void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
1888  render(mustacheTemplate.c_str(), bench.results(), out);
1889 }
1890 
1891 namespace detail {
1892 
1893 PerformanceCounters& performanceCounters() {
1894 # if defined(__clang__)
1895 # pragma clang diagnostic push
1896 # pragma clang diagnostic ignored "-Wexit-time-destructors"
1897 # endif
1898  static PerformanceCounters pc;
1899 # if defined(__clang__)
1900 # pragma clang diagnostic pop
1901 # endif
1902  return pc;
1903 }
1904 
1905 // Windows version of doNotOptimizeAway
1906 // see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
1907 // see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280
1908 // see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize
1909 # if defined(_MSC_VER)
1910 # pragma optimize("", off)
1911 void doNotOptimizeAwaySink(void const*) {}
1912 # pragma optimize("", on)
1913 # endif
1914 
1915 template <typename T>
1916 T parseFile(std::string const& filename) {
1917  std::ifstream fin(filename);
1918  T num{};
1919  fin >> num;
1920  return num;
1921 }
1922 
1923 char const* getEnv(char const* name) {
1924 # if defined(_MSC_VER)
1925 # pragma warning(push)
1926 # pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
1927 # endif
1928  return std::getenv(name);
1929 # if defined(_MSC_VER)
1930 # pragma warning(pop)
1931 # endif
1932 }
1933 
1934 bool isEndlessRunning(std::string const& name) {
1935  auto endless = getEnv("NANOBENCH_ENDLESS");
1936  return nullptr != endless && endless == name;
1937 }
1938 
1939 // True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
1940 bool isWarningsEnabled() {
1941  auto suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
1942  return nullptr == suppression || suppression == std::string("0");
1943 }
1944 
1945 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
1946  warnings.clear();
1947  recommendations.clear();
1948 
1949  bool recommendCheckFlags = false;
1950 
1951 # if defined(DEBUG)
1952  warnings.emplace_back("DEBUG defined");
1953  recommendCheckFlags = true;
1954 # endif
1955 
1956  bool recommendPyPerf = false;
1957 # if defined(__linux__)
1958  auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
1959  if (nprocs <= 0) {
1960  warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
1961  } else {
1962 
1963  // check frequency scaling
1964  for (long id = 0; id < nprocs; ++id) {
1965  auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
1966  auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
1967  auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq");
1968  auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq");
1969  if (minFreq != maxFreq) {
1970  auto minMHz = static_cast<double>(minFreq) / 1000.0;
1971  auto maxMHz = static_cast<double>(maxFreq) / 1000.0;
1972  warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
1973  detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
1974  " MHz");
1975  recommendPyPerf = true;
1976  break;
1977  }
1978  }
1979 
1980  auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
1981  if ("performance" != currentGovernor) {
1982  warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
1983  recommendPyPerf = true;
1984  }
1985 
1986  if (0 == parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo")) {
1987  warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
1988  recommendPyPerf = true;
1989  }
1990  }
1991 # endif
1992 
1993  if (recommendCheckFlags) {
1994  recommendations.emplace_back("Make sure you compile for Release");
1995  }
1996  if (recommendPyPerf) {
1997  recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
1998  }
1999 }
2000 
2001 void printStabilityInformationOnce(std::ostream* outStream) {
2002  static bool shouldPrint = true;
2003  if (shouldPrint && outStream && isWarningsEnabled()) {
2004  auto& os = *outStream;
2005  shouldPrint = false;
2006  std::vector<std::string> warnings;
2007  std::vector<std::string> recommendations;
2008  gatherStabilityInformation(warnings, recommendations);
2009  if (warnings.empty()) {
2010  return;
2011  }
2012 
2013  os << "Warning, results might be unstable:" << std::endl;
2014  for (auto const& w : warnings) {
2015  os << "* " << w << std::endl;
2016  }
2017 
2018  os << std::endl << "Recommendations" << std::endl;
2019  for (auto const& r : recommendations) {
2020  os << "* " << r << std::endl;
2021  }
2022  }
2023 }
2024 
2025 // remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
2026 uint64_t& singletonHeaderHash() noexcept {
2027  static uint64_t sHeaderHash{};
2028  return sHeaderHash;
2029 }
2030 
2031 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2032 inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2033  return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2034 }
2035 
2036 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2037 Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2038  auto bestDuration = Clock::duration::max();
2039  Clock::time_point tBegin;
2040  Clock::time_point tEnd;
2041  for (size_t i = 0; i < numEvaluations; ++i) {
2042  tBegin = Clock::now();
2043  do {
2044  tEnd = Clock::now();
2045  } while (tBegin == tEnd);
2046  bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2047  }
2048  return bestDuration;
2049 }
2050 
2051 // Calculates clock resolution once, and remembers the result
2052 Clock::duration clockResolution() noexcept {
2053  static Clock::duration sResolution = calcClockResolution(20);
2054  return sResolution;
2055 }
2056 
2057 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2058 struct IterationLogic::Impl {
2059  enum class State { warmup, upscaling_runtime, measuring, endless };
2060 
2061  explicit Impl(Bench const& bench)
2062  : mBench(bench)
2063  , mResult(bench.config()) {
2064  printStabilityInformationOnce(mBench.output());
2065 
2066  // determine target runtime per epoch
2067  mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
2068  if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
2069  mTargetRuntimePerEpoch = mBench.maxEpochTime();
2070  }
2071  if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
2072  mTargetRuntimePerEpoch = mBench.minEpochTime();
2073  }
2074 
2075  if (isEndlessRunning(mBench.name())) {
2076  std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
2077  mNumIters = (std::numeric_limits<uint64_t>::max)();
2078  mState = State::endless;
2079  } else if (0 != mBench.warmup()) {
2080  mNumIters = mBench.warmup();
2081  mState = State::warmup;
2082  } else if (0 != mBench.epochIterations()) {
2083  // exact number of iterations
2084  mNumIters = mBench.epochIterations();
2085  mState = State::measuring;
2086  } else {
2087  mNumIters = mBench.minEpochIterations();
2088  mState = State::upscaling_runtime;
2089  }
2090  }
2091 
2092  // directly calculates new iters based on elapsed&iters, and adds a 10% noise. Makes sure we don't underflow.
2093  ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
2094  auto doubleElapsed = d(elapsed);
2095  auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2096  auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2097 
2098  auto doubleMinEpochIters = d(mBench.minEpochIterations());
2099  if (doubleNewIters < doubleMinEpochIters) {
2100  doubleNewIters = doubleMinEpochIters;
2101  }
2102  doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2103 
2104  // +0.5 for correct rounding when casting
2105  // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2106  return static_cast<uint64_t>(doubleNewIters + 0.5);
2107  }
2108 
2109  ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
2110  if (elapsed * 10 < mTargetRuntimePerEpoch) {
2111  // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
2112  if (mNumIters * 10 < mNumIters) {
2113  // overflow :-(
2114  showResult("iterations overflow. Maybe your code got optimized away?");
2115  mNumIters = 0;
2116  return;
2117  }
2118  mNumIters *= 10;
2119  } else {
2120  mNumIters = calcBestNumIters(elapsed, mNumIters);
2121  }
2122  }
2123 
2124  void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2125 # if defined(ANKERL_NANOBENCH_LOG_ENABLED)
2126  auto oldIters = mNumIters;
2127 # endif
2128 
2129  switch (mState) {
2130  case State::warmup:
2131  if (isCloseEnoughForMeasurements(elapsed)) {
2132  // if elapsed is close enough, we can skip upscaling and go right to measurements
2133  // still, we don't add the result to the measurements.
2134  mState = State::measuring;
2135  mNumIters = calcBestNumIters(elapsed, mNumIters);
2136  } else {
2137  // not close enough: switch to upscaling
2138  mState = State::upscaling_runtime;
2139  upscale(elapsed);
2140  }
2141  break;
2142 
2143  case State::upscaling_runtime:
2144  if (isCloseEnoughForMeasurements(elapsed)) {
2145  // if we are close enough, add measurement and switch to always measuring
2146  mState = State::measuring;
2147  mTotalElapsed += elapsed;
2148  mTotalNumIters += mNumIters;
2149  mResult.add(elapsed, mNumIters, pc);
2150  mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2151  } else {
2152  upscale(elapsed);
2153  }
2154  break;
2155 
2156  case State::measuring:
2157  // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
2158  // that fluctuation, or else we would bias the result
2159  mTotalElapsed += elapsed;
2160  mTotalNumIters += mNumIters;
2161  mResult.add(elapsed, mNumIters, pc);
2162  if (0 != mBench.epochIterations()) {
2163  mNumIters = mBench.epochIterations();
2164  } else {
2165  mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2166  }
2167  break;
2168 
2169  case State::endless:
2170  mNumIters = (std::numeric_limits<uint64_t>::max)();
2171  break;
2172  }
2173 
2174  if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
2175  // we got all the results that we need, finish it
2176  showResult("");
2177  mNumIters = 0;
2178  }
2179 
2180  ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) << " elapsed, "
2181  << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
2182  << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters
2183  << ", mState=" << static_cast<int>(mState));
2184  }
2185 
2186  void showResult(std::string const& errorMessage) const {
2187  ANKERL_NANOBENCH_LOG(errorMessage);
2188 
2189  if (mBench.output() != nullptr) {
2190  // prepare column data ///////
2191  std::vector<fmt::MarkDownColumn> columns;
2192 
2193  auto rMedian = mResult.median(Result::Measure::elapsed);
2194 
2195  if (mBench.relative()) {
2196  double d = 100.0;
2197  if (!mBench.results().empty()) {
2198  d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
2199  }
2200  columns.emplace_back(11, 1, "relative", "%", d);
2201  }
2202 
2203  if (mBench.complexityN() > 0) {
2204  columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
2205  }
2206 
2207  columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
2208  rMedian / (mBench.timeUnit().count() * mBench.batch()));
2209  columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2210 
2211  double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
2212  columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);
2213 
2214  double rInsMedian = -1.0;
2215  if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {
2216  rInsMedian = mResult.median(Result::Measure::instructions);
2217  columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
2218  }
2219 
2220  double rCycMedian = -1.0;
2221  if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {
2222  rCycMedian = mResult.median(Result::Measure::cpucycles);
2223  columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
2224  }
2225  if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2226  columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2227  }
2228  if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {
2229  double rBraMedian = mResult.median(Result::Measure::branchinstructions);
2230  columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
2231  if (mResult.has(Result::Measure::branchmisses)) {
2232  double p = 0.0;
2233  if (rBraMedian >= 1e-9) {
2234  p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
2235  }
2236  columns.emplace_back(10, 1, "miss%", "%", p);
2237  }
2238  }
2239 
2240  columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));
2241 
2242  // write everything
2243  auto& os = *mBench.output();
2244 
2245  // combine all elements that are relevant for printing the header
2246  uint64_t hash = 0;
2247  hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
2248  hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
2249  hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
2250  hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
2251  hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
2252  hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
2253 
2254  if (hash != singletonHeaderHash()) {
2255  singletonHeaderHash() = hash;
2256 
2257  // no result yet, print header
2258  os << std::endl;
2259  for (auto const& col : columns) {
2260  os << col.title();
2261  }
2262  os << "| " << mBench.title() << std::endl;
2263 
2264  for (auto const& col : columns) {
2265  os << col.separator();
2266  }
2267  os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
2268  }
2269 
2270  if (!errorMessage.empty()) {
2271  for (auto const& col : columns) {
2272  os << col.invalid();
2273  }
2274  os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
2275  } else {
2276  for (auto const& col : columns) {
2277  os << col.value();
2278  }
2279  os << "| ";
2280  auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
2281  if (showUnstable) {
2282  os << ":wavy_dash: ";
2283  }
2284  os << fmt::MarkDownCode(mBench.name());
2285  if (showUnstable) {
2286  auto avgIters = static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
2287  // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2288  auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);
2289 
2290  os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2291  << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
2292  }
2293  os << std::endl;
2294  }
2295  }
2296  }
2297 
2298  ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
2299  return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
2300  }
2301 
2302  uint64_t mNumIters = 1;
2303  Bench const& mBench;
2304  std::chrono::nanoseconds mTargetRuntimePerEpoch{};
2305  Result mResult;
2306  Rng mRng{123};
2307  std::chrono::nanoseconds mTotalElapsed{};
2308  uint64_t mTotalNumIters = 0;
2309 
2310  State mState = State::upscaling_runtime;
2311 };
2312 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2313 
2314 IterationLogic::IterationLogic(Bench const& bench) noexcept
2315  : mPimpl(new Impl(bench)) {}
2316 
2317 IterationLogic::~IterationLogic() {
2318  if (mPimpl) {
2319  delete mPimpl;
2320  }
2321 }
2322 
2323 uint64_t IterationLogic::numIters() const noexcept {
2324  ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
2325  return mPimpl->mNumIters;
2326 }
2327 
2328 void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2329  mPimpl->add(elapsed, pc);
2330 }
2331 
2332 void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2333  results.emplace_back(std::move(mPimpl->mResult));
2334 }
2335 
2336 # if ANKERL_NANOBENCH(PERF_COUNTERS)
2337 
2338 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2339 class LinuxPerformanceCounters {
2340 public:
2341  struct Target {
2342  Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
2343  : targetValue(targetValue_)
2344  , correctMeasuringOverhead(correctMeasuringOverhead_)
2345  , correctLoopOverhead(correctLoopOverhead_) {}
2346 
2347  uint64_t* targetValue{};
2348  bool correctMeasuringOverhead{};
2349  bool correctLoopOverhead{};
2350  };
2351 
2352  ~LinuxPerformanceCounters();
2353 
2354  // quick operation
2355  inline void start() {}
2356 
2357  inline void stop() {}
2358 
2359  bool monitor(perf_sw_ids swId, Target target);
2360  bool monitor(perf_hw_id hwId, Target target);
2361 
2362  bool hasError() const noexcept {
2363  return mHasError;
2364  }
2365 
2366  // Just reading data is faster than enable & disabling.
2367  // we subtract data ourselves.
2368  inline void beginMeasure() {
2369  if (mHasError) {
2370  return;
2371  }
2372 
2373  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2374  mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2375  if (mHasError) {
2376  return;
2377  }
2378 
2379  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2380  mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2381  }
2382 
2383  inline void endMeasure() {
2384  if (mHasError) {
2385  return;
2386  }
2387 
2388  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2389  mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2390  if (mHasError) {
2391  return;
2392  }
2393 
2394  auto const numBytes = sizeof(uint64_t) * mCounters.size();
2395  auto ret = read(mFd, mCounters.data(), numBytes);
2396  mHasError = ret != static_cast<ssize_t>(numBytes);
2397  }
2398 
2399  void updateResults(uint64_t numIters);
2400 
2401  // rounded integer division
2402  template <typename T>
2403  static inline T divRounded(T a, T divisor) {
2404  return (a + divisor / 2) / divisor;
2405  }
2406 
2407  ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2408  static inline uint32_t mix(uint32_t x) noexcept {
2409  x ^= x << 13;
2410  x ^= x >> 17;
2411  x ^= x << 5;
2412  return x;
2413  }
2414 
2415  template <typename Op>
2416  ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2417  void calibrate(Op&& op) {
2418  // clear current calibration data,
2419  for (auto& v : mCalibratedOverhead) {
2420  v = UINT64_C(0);
2421  }
2422 
2423  // create new calibration data
2424  auto newCalibration = mCalibratedOverhead;
2425  for (auto& v : newCalibration) {
2426  v = (std::numeric_limits<uint64_t>::max)();
2427  }
2428  for (size_t iter = 0; iter < 100; ++iter) {
2429  beginMeasure();
2430  op();
2431  endMeasure();
2432  if (mHasError) {
2433  return;
2434  }
2435 
2436  for (size_t i = 0; i < newCalibration.size(); ++i) {
2437  auto diff = mCounters[i];
2438  if (newCalibration[i] > diff) {
2439  newCalibration[i] = diff;
2440  }
2441  }
2442  }
2443 
2444  mCalibratedOverhead = std::move(newCalibration);
2445 
2446  {
2447  // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
2448  // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
2449  // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
2450  // see https://godbolt.org/z/49RVQ5
2451  uint64_t const numIters = 100000U + (std::random_device{}() & 3);
2452  uint64_t n = numIters;
2453  uint32_t x = 1234567;
2454 
2455  beginMeasure();
2456  while (n-- > 0) {
2457  x = mix(x);
2458  }
2459  endMeasure();
2461  auto measure1 = mCounters;
2462 
2463  n = numIters;
2464  beginMeasure();
2465  while (n-- > 0) {
2466  // we now run *twice* so we can easily calculate the overhead
2467  x = mix(x);
2468  x = mix(x);
2469  }
2470  endMeasure();
2472  auto measure2 = mCounters;
2473 
2474  for (size_t i = 0; i < mCounters.size(); ++i) {
2475  // factor 2 because we have two instructions per loop
2476  auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2477  auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2478  auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2479 
2480  mLoopOverhead[i] = divRounded(overhead, numIters);
2481  }
2482  }
2483  }
2484 
2485 private:
2486  bool monitor(uint32_t type, uint64_t eventid, Target target);
2487 
2488  std::map<uint64_t, Target> mIdToTarget{};
2489 
2490  // start with minimum size of 3 for read_format
2491  std::vector<uint64_t> mCounters{3};
2492  std::vector<uint64_t> mCalibratedOverhead{3};
2493  std::vector<uint64_t> mLoopOverhead{3};
2494 
2495  uint64_t mTimeEnabledNanos = 0;
2496  uint64_t mTimeRunningNanos = 0;
2497  int mFd = -1;
2498  bool mHasError = false;
2499 };
2500 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2501 
2502 LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2503  if (-1 != mFd) {
2504  close(mFd);
2505  }
2506 }
2507 
2508 bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2509  return monitor(PERF_TYPE_SOFTWARE, swId, target);
2510 }
2511 
2512 bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2513  return monitor(PERF_TYPE_HARDWARE, hwId, target);
2514 }
2515 
2516 // overflow is ok, it's checked
2517 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2518 void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
2519  // clear old data
2520  for (auto& id_value : mIdToTarget) {
2521  *id_value.second.targetValue = UINT64_C(0);
2522  }
2523 
2524  if (mHasError) {
2525  return;
2526  }
2527 
2528  mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2529  mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2530 
2531  for (uint64_t i = 0; i < mCounters[0]; ++i) {
2532  auto idx = static_cast<size_t>(3 + i * 2 + 0);
2533  auto id = mCounters[idx + 1U];
2534 
2535  auto it = mIdToTarget.find(id);
2536  if (it != mIdToTarget.end()) {
2537 
2538  auto& tgt = it->second;
2539  *tgt.targetValue = mCounters[idx];
2540  if (tgt.correctMeasuringOverhead) {
2541  if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2542  *tgt.targetValue -= mCalibratedOverhead[idx];
2543  } else {
2544  *tgt.targetValue = 0U;
2545  }
2546  }
2547  if (tgt.correctLoopOverhead) {
2548  auto correctionVal = mLoopOverhead[idx] * numIters;
2549  if (*tgt.targetValue >= correctionVal) {
2550  *tgt.targetValue -= correctionVal;
2551  } else {
2552  *tgt.targetValue = 0U;
2553  }
2554  }
2555  }
2556  }
2557 }
2558 
2559 bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
2560  *target.targetValue = (std::numeric_limits<uint64_t>::max)();
2561  if (mHasError) {
2562  return false;
2563  }
2564 
2565  auto pea = perf_event_attr();
2566  std::memset(&pea, 0, sizeof(perf_event_attr));
2567  pea.type = type;
2568  pea.size = sizeof(perf_event_attr);
2569  pea.config = eventid;
2570  pea.disabled = 1; // start counter as disabled
2571  pea.exclude_kernel = 1;
2572  pea.exclude_hv = 1;
2573 
2574  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2575  pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
2576 
2577  const int pid = 0; // the current process
2578  const int cpu = -1; // all CPUs
2579 # if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
2580  const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2581 # else
2582  const unsigned long flags = 0;
2583 # endif
2584 
2585  auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
2586  if (-1 == fd) {
2587  return false;
2588  }
2589  if (-1 == mFd) {
2590  // first call: set to fd, and use this from now on
2591  mFd = fd;
2592  }
2593  uint64_t id = 0;
2594  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2595  if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
2596  // couldn't get id
2597  return false;
2598  }
2599 
2600  // insert into map, rely on the fact that map's references are constant.
2601  mIdToTarget.emplace(id, target);
2602 
2603  // prepare readformat with the correct size (after the insert)
2604  auto size = 3 + 2 * mIdToTarget.size();
2605  mCounters.resize(size);
2606  mCalibratedOverhead.resize(size);
2607  mLoopOverhead.resize(size);
2608 
2609  return true;
2610 }
2611 
2612 PerformanceCounters::PerformanceCounters()
2613  : mPc(new LinuxPerformanceCounters())
2614  , mVal()
2615  , mHas() {
2616 
2617  mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
2618  mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
2619  mHas.contextSwitches =
2620  mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
2621  mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
2622  mHas.branchInstructions =
2623  mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
2624  mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
2625  // mHas.branchMisses = false;
2626 
2627  mPc->start();
2628  mPc->calibrate([] {
2629  auto before = ankerl::nanobench::Clock::now();
2630  auto after = ankerl::nanobench::Clock::now();
2631  (void)before;
2632  (void)after;
2633  });
2634 
2635  if (mPc->hasError()) {
2636  // something failed, don't monitor anything.
2637  mHas = PerfCountSet<bool>{};
2638  }
2639 }
2640 
2641 PerformanceCounters::~PerformanceCounters() {
2642  if (nullptr != mPc) {
2643  delete mPc;
2644  }
2645 }
2646 
2647 void PerformanceCounters::beginMeasure() {
2648  mPc->beginMeasure();
2649 }
2650 
2651 void PerformanceCounters::endMeasure() {
2652  mPc->endMeasure();
2653 }
2654 
2655 void PerformanceCounters::updateResults(uint64_t numIters) {
2656  mPc->updateResults(numIters);
2657 }
2658 
2659 # else
2660 
2661 PerformanceCounters::PerformanceCounters() = default;
2662 PerformanceCounters::~PerformanceCounters() = default;
2663 void PerformanceCounters::beginMeasure() {}
2664 void PerformanceCounters::endMeasure() {}
2665 void PerformanceCounters::updateResults(uint64_t) {}
2666 
2667 # endif
2668 
2669 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
2670  return mVal;
2671 }
2672 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
2673  return mHas;
2674 }
2675 
2676 // formatting utilities
2677 namespace fmt {
2678 
2679 // adds thousands separator to numbers
2680 NumSep::NumSep(char sep)
2681  : mSep(sep) {}
2682 
2683 char NumSep::do_thousands_sep() const {
2684  return mSep;
2685 }
2686 
2687 std::string NumSep::do_grouping() const {
2688  return "\003";
2689 }
2690 
2691 // RAII to save & restore a stream's state
2692 StreamStateRestorer::StreamStateRestorer(std::ostream& s)
2693  : mStream(s)
2694  , mLocale(s.getloc())
2695  , mPrecision(s.precision())
2696  , mWidth(s.width())
2697  , mFill(s.fill())
2698  , mFmtFlags(s.flags()) {}
2699 
2700 StreamStateRestorer::~StreamStateRestorer() {
2701  restore();
2702 }
2703 
2704 // sets back all stream info that we remembered at construction
2705 void StreamStateRestorer::restore() {
2706  mStream.imbue(mLocale);
2707  mStream.precision(mPrecision);
2708  mStream.width(mWidth);
2709  mStream.fill(mFill);
2710  mStream.flags(mFmtFlags);
2711 }
2712 
2713 Number::Number(int width, int precision, int64_t value)
2714  : mWidth(width)
2715  , mPrecision(precision)
2716  , mValue(static_cast<double>(value)) {}
2717 
2718 Number::Number(int width, int precision, double value)
2719  : mWidth(width)
2720  , mPrecision(precision)
2721  , mValue(value) {}
2722 
2723 std::ostream& Number::write(std::ostream& os) const {
2724  StreamStateRestorer restorer(os);
2725  os.imbue(std::locale(os.getloc(), new NumSep(',')));
2726  os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
2727  return os;
2728 }
2729 
2730 std::string Number::to_s() const {
2731  std::stringstream ss;
2732  write(ss);
2733  return ss.str();
2734 }
2735 
2736 std::string to_s(uint64_t n) {
2737  std::string str;
2738  do {
2739  str += static_cast<char>('0' + static_cast<char>(n % 10));
2740  n /= 10;
2741  } while (n != 0);
2742  std::reverse(str.begin(), str.end());
2743  return str;
2744 }
2745 
2746 std::ostream& operator<<(std::ostream& os, Number const& n) {
2747  return n.write(os);
2748 }
2749 
2750 MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val)
2751  : mWidth(w)
2752  , mPrecision(prec)
2753  , mTitle(tit)
2754  , mSuffix(suff)
2755  , mValue(val) {}
2756 
2757 std::string MarkDownColumn::title() const {
2758  std::stringstream ss;
2759  ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
2760  return ss.str();
2761 }
2762 
2763 std::string MarkDownColumn::separator() const {
2764  std::string sep(static_cast<size_t>(mWidth), '-');
2765  sep.front() = '|';
2766  sep.back() = ':';
2767  return sep;
2768 }
2769 
2770 std::string MarkDownColumn::invalid() const {
2771  std::string sep(static_cast<size_t>(mWidth), ' ');
2772  sep.front() = '|';
2773  sep[sep.size() - 2] = '-';
2774  return sep;
2775 }
2776 
2777 std::string MarkDownColumn::value() const {
2778  std::stringstream ss;
2779  auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
2780  ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
2781  return ss.str();
2782 }
2783 
2784 // Formats any text as markdown code, escaping backticks.
2785 MarkDownCode::MarkDownCode(std::string const& what) {
2786  mWhat.reserve(what.size() + 2);
2787  mWhat.push_back('`');
2788  for (char c : what) {
2789  mWhat.push_back(c);
2790  if ('`' == c) {
2791  mWhat.push_back('`');
2792  }
2793  }
2794  mWhat.push_back('`');
2795 }
2796 
2797 std::ostream& MarkDownCode::write(std::ostream& os) const {
2798  return os << mWhat;
2799 }
2800 
2801 std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
2802  return mdCode.write(os);
2803 }
2804 } // namespace fmt
2805 } // namespace detail
2806 
2807 // provide implementation here so it's only generated once
2808 Config::Config() = default;
2809 Config::~Config() = default;
2810 Config& Config::operator=(Config const&) = default;
2811 Config& Config::operator=(Config&&) = default;
2812 Config::Config(Config const&) = default;
2813 Config::Config(Config&&) noexcept = default;
2814 
2815 // provide implementation here so it's only generated once
2816 Result::~Result() = default;
2817 Result& Result::operator=(Result const&) = default;
2818 Result& Result::operator=(Result&&) = default;
2819 Result::Result(Result const&) = default;
2820 Result::Result(Result&&) noexcept = default;
2821 
2822 namespace detail {
2823 template <typename T>
2824 inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
2825  return static_cast<typename std::underlying_type<T>::type>(val);
2826 }
2827 } // namespace detail
2828 
2829 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
2830 Result::Result(Config const& benchmarkConfig)
2831  : mConfig(benchmarkConfig)
2832  , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
2833 
2834 void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
2835  using detail::d;
2836  using detail::u;
2837 
2838  double dIters = d(iters);
2839  mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
2840 
2841  mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
2842  if (pc.has().pageFaults) {
2843  mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
2844  }
2845  if (pc.has().cpuCycles) {
2846  mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
2847  }
2848  if (pc.has().contextSwitches) {
2849  mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
2850  }
2851  if (pc.has().instructions) {
2852  mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
2853  }
2854  if (pc.has().branchInstructions) {
2855  double branchInstructions = 0.0;
2856  // correcting branches: remove branch introduced by the while (...) loop for each iteration.
2857  if (pc.val().branchInstructions > iters + 1U) {
2858  branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
2859  }
2860  mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
2861 
2862  if (pc.has().branchMisses) {
2863  // correcting branch misses
2864  double branchMisses = d(pc.val().branchMisses);
2865  if (branchMisses > branchInstructions) {
2866  // can't have branch misses when there were branches...
2867  branchMisses = branchInstructions;
2868  }
2869 
2870  // assuming at least one missed branch for the loop
2871  branchMisses -= 1.0;
2872  if (branchMisses < 1.0) {
2873  branchMisses = 1.0;
2874  }
2875  mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
2876  }
2877  }
2878 }
2879 
2880 Config const& Result::config() const noexcept {
2881  return mConfig;
2882 }
2883 
2884 inline double calcMedian(std::vector<double>& data) {
2885  if (data.empty()) {
2886  return 0.0;
2887  }
2888  std::sort(data.begin(), data.end());
2889 
2890  auto midIdx = data.size() / 2U;
2891  if (1U == (data.size() & 1U)) {
2892  return data[midIdx];
2893  }
2894  return (data[midIdx - 1U] + data[midIdx]) / 2U;
2895 }
2896 
2897 double Result::median(Measure m) const {
2898  // create a copy so we can sort
2899  auto data = mNameToMeasurements[detail::u(m)];
2900  return calcMedian(data);
2901 }
2902 
2903 double Result::average(Measure m) const {
2904  using detail::d;
2905  auto const& data = mNameToMeasurements[detail::u(m)];
2906  if (data.empty()) {
2907  return 0.0;
2908  }
2909 
2910  // create a copy so we can sort
2911  return sum(m) / d(data.size());
2912 }
2913 
2914 double Result::medianAbsolutePercentError(Measure m) const {
2915  // create copy
2916  auto data = mNameToMeasurements[detail::u(m)];
2917 
2918  // calculates MdAPE which is the median of percentage error
2919  // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape
2920  auto med = calcMedian(data);
2921 
2922  // transform the data to absolute error
2923  for (auto& x : data) {
2924  x = (x - med) / x;
2925  if (x < 0) {
2926  x = -x;
2927  }
2928  }
2929  return calcMedian(data);
2930 }
2931 
2932 double Result::sum(Measure m) const noexcept {
2933  auto const& data = mNameToMeasurements[detail::u(m)];
2934  return std::accumulate(data.begin(), data.end(), 0.0);
2935 }
2936 
2937 double Result::sumProduct(Measure m1, Measure m2) const noexcept {
2938  auto const& data1 = mNameToMeasurements[detail::u(m1)];
2939  auto const& data2 = mNameToMeasurements[detail::u(m2)];
2940 
2941  if (data1.size() != data2.size()) {
2942  return 0.0;
2943  }
2944 
2945  double result = 0.0;
2946  for (size_t i = 0, s = data1.size(); i != s; ++i) {
2947  result += data1[i] * data2[i];
2948  }
2949  return result;
2950 }
2951 
2952 bool Result::has(Measure m) const noexcept {
2953  return !mNameToMeasurements[detail::u(m)].empty();
2954 }
2955 
2956 double Result::get(size_t idx, Measure m) const {
2957  auto const& data = mNameToMeasurements[detail::u(m)];
2958  return data.at(idx);
2959 }
2960 
2961 bool Result::empty() const noexcept {
2962  return 0U == size();
2963 }
2964 
2965 size_t Result::size() const noexcept {
2966  auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
2967  return data.size();
2968 }
2969 
2970 double Result::minimum(Measure m) const noexcept {
2971  auto const& data = mNameToMeasurements[detail::u(m)];
2972  if (data.empty()) {
2973  return 0.0;
2974  }
2975 
2976  // here its save to assume that at least one element is there
2977  return *std::min_element(data.begin(), data.end());
2978 }
2979 
2980 double Result::maximum(Measure m) const noexcept {
2981  auto const& data = mNameToMeasurements[detail::u(m)];
2982  if (data.empty()) {
2983  return 0.0;
2984  }
2985 
2986  // here its save to assume that at least one element is there
2987  return *std::max_element(data.begin(), data.end());
2988 }
2989 
2990 Result::Measure Result::fromString(std::string const& str) {
2991  if (str == "elapsed") {
2992  return Measure::elapsed;
2993  } else if (str == "iterations") {
2994  return Measure::iterations;
2995  } else if (str == "pagefaults") {
2996  return Measure::pagefaults;
2997  } else if (str == "cpucycles") {
2998  return Measure::cpucycles;
2999  } else if (str == "contextswitches") {
3000  return Measure::contextswitches;
3001  } else if (str == "instructions") {
3002  return Measure::instructions;
3003  } else if (str == "branchinstructions") {
3004  return Measure::branchinstructions;
3005  } else if (str == "branchmisses") {
3006  return Measure::branchmisses;
3007  } else {
3008  // not found, return _size
3009  return Measure::_size;
3010  }
3011 }
3012 
3013 // Configuration of a microbenchmark.
3014 Bench::Bench() {
3015  mConfig.mOut = &std::cout;
3016 }
3017 
3018 Bench::Bench(Bench&&) = default;
3019 Bench& Bench::operator=(Bench&&) = default;
3020 Bench::Bench(Bench const&) = default;
3021 Bench& Bench::operator=(Bench const&) = default;
3022 Bench::~Bench() noexcept = default;
3023 
3024 double Bench::batch() const noexcept {
3025  return mConfig.mBatch;
3026 }
3027 
3028 double Bench::complexityN() const noexcept {
3029  return mConfig.mComplexityN;
3030 }
3031 
3032 // Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100%
3033 // means it is slower than the baseline.
3034 Bench& Bench::relative(bool isRelativeEnabled) noexcept {
3035  mConfig.mIsRelative = isRelativeEnabled;
3036  return *this;
3037 }
3038 bool Bench::relative() const noexcept {
3039  return mConfig.mIsRelative;
3040 }
3041 
3042 Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
3043  mConfig.mShowPerformanceCounters = showPerformanceCounters;
3044  return *this;
3045 }
3046 bool Bench::performanceCounters() const noexcept {
3047  return mConfig.mShowPerformanceCounters;
3048 }
3049 
3050 // Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
3051 // If u differs from currently set unit, the stored results will be cleared.
3052 // Use singular (byte, not bytes).
3053 Bench& Bench::unit(char const* u) {
3054  if (u != mConfig.mUnit) {
3055  mResults.clear();
3056  }
3057  mConfig.mUnit = u;
3058  return *this;
3059 }
3060 
3061 Bench& Bench::unit(std::string const& u) {
3062  return unit(u.c_str());
3063 }
3064 
3065 std::string const& Bench::unit() const noexcept {
3066  return mConfig.mUnit;
3067 }
3068 
3069 Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
3070  mConfig.mTimeUnit = tu;
3071  mConfig.mTimeUnitName = tuName;
3072  return *this;
3073 }
3074 
3075 std::string const& Bench::timeUnitName() const noexcept {
3076  return mConfig.mTimeUnitName;
3077 }
3078 
3079 std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
3080  return mConfig.mTimeUnit;
3081 }
3082 
3083 // If benchmarkTitle differs from currently set title, the stored results will be cleared.
3084 Bench& Bench::title(const char* benchmarkTitle) {
3085  if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3086  mResults.clear();
3087  }
3088  mConfig.mBenchmarkTitle = benchmarkTitle;
3089  return *this;
3090 }
3091 Bench& Bench::title(std::string const& benchmarkTitle) {
3092  if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3093  mResults.clear();
3094  }
3095  mConfig.mBenchmarkTitle = benchmarkTitle;
3096  return *this;
3097 }
3098 
3099 std::string const& Bench::title() const noexcept {
3100  return mConfig.mBenchmarkTitle;
3101 }
3102 
3103 Bench& Bench::name(const char* benchmarkName) {
3104  mConfig.mBenchmarkName = benchmarkName;
3105  return *this;
3106 }
3107 
3108 Bench& Bench::name(std::string const& benchmarkName) {
3109  mConfig.mBenchmarkName = benchmarkName;
3110  return *this;
3111 }
3112 
3113 std::string const& Bench::name() const noexcept {
3114  return mConfig.mBenchmarkName;
3115 }
3116 
3117 // Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
3118 Bench& Bench::epochs(size_t numEpochs) noexcept {
3119  mConfig.mNumEpochs = numEpochs;
3120  return *this;
3121 }
3122 size_t Bench::epochs() const noexcept {
3123  return mConfig.mNumEpochs;
3124 }
3125 
3126 // Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.
3127 Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
3128  mConfig.mClockResolutionMultiple = multiple;
3129  return *this;
3130 }
3131 size_t Bench::clockResolutionMultiple() const noexcept {
3132  return mConfig.mClockResolutionMultiple;
3133 }
3134 
3135 // Sets the maximum time each epoch should take. Default is 100ms.
3136 Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
3137  mConfig.mMaxEpochTime = t;
3138  return *this;
3139 }
3140 std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3141  return mConfig.mMaxEpochTime;
3142 }
3143 
3144 // Sets the maximum time each epoch should take. Default is 100ms.
3145 Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
3146  mConfig.mMinEpochTime = t;
3147  return *this;
3148 }
3149 std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3150  return mConfig.mMinEpochTime;
3151 }
3152 
3153 Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
3154  mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3155  return *this;
3156 }
3157 uint64_t Bench::minEpochIterations() const noexcept {
3158  return mConfig.mMinEpochIterations;
3159 }
3160 
3161 Bench& Bench::epochIterations(uint64_t numIters) noexcept {
3162  mConfig.mEpochIterations = numIters;
3163  return *this;
3164 }
3165 uint64_t Bench::epochIterations() const noexcept {
3166  return mConfig.mEpochIterations;
3167 }
3168 
3169 Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
3170  mConfig.mWarmup = numWarmupIters;
3171  return *this;
3172 }
3173 uint64_t Bench::warmup() const noexcept {
3174  return mConfig.mWarmup;
3175 }
3176 
3177 Bench& Bench::config(Config const& benchmarkConfig) {
3178  mConfig = benchmarkConfig;
3179  return *this;
3180 }
3181 Config const& Bench::config() const noexcept {
3182  return mConfig;
3183 }
3184 
3185 Bench& Bench::output(std::ostream* outstream) noexcept {
3186  mConfig.mOut = outstream;
3187  return *this;
3188 }
3189 
3190 ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
3191  return mConfig.mOut;
3192 }
3193 
3194 std::vector<Result> const& Bench::results() const noexcept {
3195  return mResults;
3196 }
3197 
3198 Bench& Bench::render(char const* templateContent, std::ostream& os) {
3199  ::ankerl::nanobench::render(templateContent, *this, os);
3200  return *this;
3201 }
3202 
3203 Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
3204  ::ankerl::nanobench::render(templateContent, *this, os);
3205  return *this;
3206 }
3207 
3208 std::vector<BigO> Bench::complexityBigO() const {
3209  std::vector<BigO> bigOs;
3210  auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3211  bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
3212  return 1.0;
3213  });
3214  bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
3215  return n;
3216  });
3217  bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
3218  return std::log2(n);
3219  });
3220  bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
3221  return n * std::log2(n);
3222  });
3223  bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
3224  return n * n;
3225  });
3226  bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
3227  return n * n * n;
3228  });
3229  std::sort(bigOs.begin(), bigOs.end());
3230  return bigOs;
3231 }
3232 
3233 Rng::Rng()
3234  : mX(0)
3235  , mY(0) {
3236  std::random_device rd;
3237  std::uniform_int_distribution<uint64_t> dist;
3238  do {
3239  mX = dist(rd);
3240  mY = dist(rd);
3241  } while (mX == 0 && mY == 0);
3242 }
3243 
3244 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
3245 uint64_t splitMix64(uint64_t& state) noexcept {
3246  uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3247  z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3248  z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3249  return z ^ (z >> 31U);
3250 }
3251 
3252 // Seeded as described in romu paper (update april 2020)
3253 Rng::Rng(uint64_t seed) noexcept
3254  : mX(splitMix64(seed))
3255  , mY(splitMix64(seed)) {
3256  for (size_t i = 0; i < 10; ++i) {
3257  operator()();
3258  }
3259 }
3260 
3261 // only internally used to copy the RNG.
3262 Rng::Rng(uint64_t x, uint64_t y) noexcept
3263  : mX(x)
3264  , mY(y) {}
3265 
3266 Rng Rng::copy() const noexcept {
3267  return Rng{mX, mY};
3268 }
3269 
3270 Rng::Rng(std::vector<uint64_t> const& data)
3271  : mX(0)
3272  , mY(0) {
3273  if (data.size() != 2) {
3274  throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
3275  detail::fmt::to_s(data.size()));
3276  }
3277  mX = data[0];
3278  mY = data[1];
3279 }
3280 
3281 std::vector<uint64_t> Rng::state() const {
3282  std::vector<uint64_t> data(2);
3283  data[0] = mX;
3284  data[1] = mY;
3285  return data;
3286 }
3287 
3288 BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
3289  BigO::RangeMeasure rangeMeasure;
3290  for (auto const& result : results) {
3291  if (result.config().mComplexityN > 0.0) {
3292  rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3293  }
3294  }
3295  return rangeMeasure;
3296 }
3297 
3298 BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure)
3299  : mName(bigOName) {
3300 
3301  // estimate the constant factor
3302  double sumRangeMeasure = 0.0;
3303  double sumRangeRange = 0.0;
3304 
3305  for (size_t i = 0; i < rangeMeasure.size(); ++i) {
3306  sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
3307  sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
3308  }
3309  mConstant = sumRangeMeasure / sumRangeRange;
3310 
3311  // calculate root mean square
3312  double err = 0.0;
3313  double sumMeasure = 0.0;
3314  for (size_t i = 0; i < rangeMeasure.size(); ++i) {
3315  auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
3316  err += diff * diff;
3317 
3318  sumMeasure += rangeMeasure[i].second;
3319  }
3320 
3321  auto n = static_cast<double>(rangeMeasure.size());
3322  auto mean = sumMeasure / n;
3323  mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3324 }
3325 
3326 BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
3327  : BigO(std::string(bigOName), rangeMeasure) {}
3328 
3329 std::string const& BigO::name() const noexcept {
3330  return mName;
3331 }
3332 
3333 double BigO::constant() const noexcept {
3334  return mConstant;
3335 }
3336 
3337 double BigO::normalizedRootMeanSquare() const noexcept {
3338  return mNormalizedRootMeanSquare;
3339 }
3340 
3341 bool BigO::operator<(BigO const& other) const noexcept {
3342  return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3343 }
3344 
3345 std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
3346  return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
3347 }
3348 
3349 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
3350  detail::fmt::StreamStateRestorer restorer(os);
3351  os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
3352  for (auto const& bigO : bigOs) {
3353  os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
3354  os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
3355  os << "| " << bigO.name();
3356  os << std::endl;
3357  }
3358  return os;
3359 }
3360 
3361 } // namespace nanobench
3362 } // namespace ankerl
3363 
3364 #endif // ANKERL_NANOBENCH_IMPLEMENT
3365 #endif // ANKERL_NANOBENCH_H_INCLUDED
int flags
Definition: bitcoin-tx.cpp:533
Definition: config.h:17
Config()=default
Config & operator=(const Config &)=delete
Main entry point to nanobench's benchmarking facility.
Definition: nanobench.h:616
Bench & complexityN(T b) noexcept
Definition: nanobench.h:1214
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
Definition: nanobench.h:1183
ANKERL_NANOBENCH(NODISCARD) std Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.
Bench & operator=(Bench &&other)
Bench()
Creates a new benchmark for configuration and running of benchmarks.
ANKERL_NANOBENCH(NODISCARD) std Bench & batch(T b) noexcept
Sets the batch size.
std::vector< BigO > complexityBigO() const
Bench & operator=(Bench const &other)
Bench(Bench const &other)
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)
Definition: nanobench.h:1070
std::vector< std::pair< double, double > > RangeMeasure
Definition: nanobench.h:1067
BigO(std::string const &bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1084
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1080
BigO(std::string const &bigOName, RangeMeasure const &scaledRangeMeasure)
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)
BigO(char const *bigOName, RangeMeasure const &scaledRangeMeasure)
Result(Result const &)
static Measure fromString(std::string const &str)
Result(Config const &benchmarkConfig)
Result & operator=(Result &&)
Result(Result &&) noexcept
Result & operator=(Result const &)
An extremely fast random generator.
Definition: nanobench.h:477
static constexpr uint64_t() min()
Rng(Rng const &)=delete
As a safety precausion, we don't allow copying.
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
Definition: nanobench.h:1145
Rng(Rng &&) noexcept=default
Rng & operator=(Rng const &)=delete
Same as Rng(Rng const&), we don't allow assignment.
static constexpr uint64_t() max()
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
Definition: nanobench.h:1135
uint64_t result_type
This RNG provides 64bit randomness.
Definition: nanobench.h:482
void moveResultTo(std::vector< Result > &results) noexcept
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept
ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept
IterationLogic(Bench const &config) noexcept
PerformanceCounters & operator=(PerformanceCounters const &)=delete
PerformanceCounters(PerformanceCounters const &)=delete
ANKERL_NANOBENCH(NODISCARD) PerfCountSet< uint64_t > const &val() const noexcept
volatile double sum
Definition: examples.cpp:10
PerformanceCounters & performanceCounters()
void doNotOptimizeAway(T const &val)
Definition: nanobench.h:999
char const * json() noexcept
Template to generate JSON data.
char const * pyperf() noexcept
Output in pyperf compatible JSON format, which can be used for more analyzations.
char const * csv() noexcept
CSV data for the benchmark results.
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an exampl...
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
Definition: nanobench.h:128
void render(std::string const &mustacheTemplate, std::vector< Result > const &results, std::ostream &out)
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
Definition: nanobench.h:1228
std::ostream & operator<<(std::ostream &os, BigO const &bigO)
Implement std::hash so RCUPtr can be used as a key for maps or sets.
Definition: rcu.h:257
#define ANKERL_NANOBENCH_LOG(x)
Definition: nanobench.h:86
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
Definition: nanobench.h:105
#define ANKERL_NANOBENCH(x)
Definition: nanobench.h:48
bool operator==(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:677
bool operator<(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:681
const char * name
Definition: rest.cpp:48
static RPCHelpMan stop()
Definition: server.cpp:209
Config & operator=(Config const &)
Config(Config &&) noexcept
Config(Config const &)
Config & operator=(Config &&)
static int count
Definition: tests.c:31