Commit 7a767012 by Eric Fiselier

Adopt new benchmark timing internals.

This patch adopts a new internal structure for how timings are performed. Currently every iteration of a benchmark checks to see if it has been running for an appropriate amount of time. Checking the clock introduces noise into the timings and this can cause inconsistent output from each benchmark. Now every iteration of a benchmark only checks an iteration count to see if it should stop running. The iteration count is determined beforehand by testing the benchmark on a series of increasing iteration counts until a suitable count is found. This increases the amount of time it takes to run the actual benchmarks but it also greatly increases the accuracy of the results. This patch introduces some breaking changes. The notable breaking changes are: 1. Benchmarks run on multiple threads no longer generate a report per thread. Instead only a single report is generated. 2. ::benchmark::UseRealTime() was removed and replaced with State::UseRealTime().
parent 7c6a7e30
...@@ -44,6 +44,10 @@ add_cxx_compiler_flag(-pedantic-errors) ...@@ -44,6 +44,10 @@ add_cxx_compiler_flag(-pedantic-errors)
add_cxx_compiler_flag(-fno-strict-aliasing RELEASE) add_cxx_compiler_flag(-fno-strict-aliasing RELEASE)
add_cxx_compiler_flag(-Wthread-safety) add_cxx_compiler_flag(-Wthread-safety)
if (HAVE_WTHREAD_SAFETY)
add_definitions(-DHAVE_WTHREAD_SAFETY)
cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
endif()
# C++ feature checks # C++ feature checks
cxx_feature_check(STD_REGEX) cxx_feature_check(STD_REGEX)
......
// CMake feature-check translation unit: compiles ../src/mutex.h with the
// thread-safety-analysis annotations force-enabled. If this file builds,
// the compiler supports the attributes and THREAD_SAFETY_ATTRIBUTES is set.
#define HAVE_THREAD_SAFETY_ATTRIBUTES
#include "../src/mutex.h"
int main() {}
...@@ -135,7 +135,8 @@ BENCHMARK(BM_MultiThreaded)->Threads(4); ...@@ -135,7 +135,8 @@ BENCHMARK(BM_MultiThreaded)->Threads(4);
#ifndef BENCHMARK_BENCHMARK_H_ #ifndef BENCHMARK_BENCHMARK_H_
#define BENCHMARK_BENCHMARK_H_ #define BENCHMARK_BENCHMARK_H_
#include <stdint.h> #include <cassert>
#include <cstdint>
#include <functional> #include <functional>
#include <memory> #include <memory>
...@@ -153,10 +154,7 @@ void Initialize(int* argc, const char** argv); ...@@ -153,10 +154,7 @@ void Initialize(int* argc, const char** argv);
// Otherwise, run all benchmarks specified by the --benchmark_filter flag, // Otherwise, run all benchmarks specified by the --benchmark_filter flag,
// and exit after running the benchmarks. // and exit after running the benchmarks.
void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr); void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = NULL);
// ------------------------------------------------------
// Routines that can be called from within a benchmark
// If this routine is called, peak memory allocation past this point in the // If this routine is called, peak memory allocation past this point in the
// benchmark is reported at the end of the benchmark report line. (It is // benchmark is reported at the end of the benchmark report line. (It is
...@@ -165,14 +163,6 @@ void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr); ...@@ -165,14 +163,6 @@ void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr);
// TODO(dominic) // TODO(dominic)
// void MemoryUsage(); // void MemoryUsage();
// If a particular benchmark is I/O bound, or if for some reason CPU
// timings are not representative, call this method from within the
// benchmark routine. If called, the elapsed time will be used to
// control how many iterations are run, and in the printing of
// items/second or MB/seconds values. If not called, the cpu time
// used by the benchmark will be used.
void UseRealTime();
namespace internal { namespace internal {
class Benchmark; class Benchmark;
class BenchmarkFamilies; class BenchmarkFamilies;
...@@ -181,13 +171,63 @@ class BenchmarkFamilies; ...@@ -181,13 +171,63 @@ class BenchmarkFamilies;
// State is passed to a running Benchmark and contains state for the // State is passed to a running Benchmark and contains state for the
// benchmark to use. // benchmark to use.
class State { class State {
public: public:
State(size_t max_iters, bool has_x, int x, bool has_y, int y, int thread_i);
// Returns true iff the benchmark should continue through another iteration. // Returns true iff the benchmark should continue through another iteration.
bool KeepRunning(); // NOTE: A benchmark may not return from the test until KeepRunning() has
// returned false.
bool KeepRunning() {
// First call for this thread: start the timer so setup code executed
// before the loop is not measured.
if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
ResumeTiming();
started_ = true;
}
// Run until the precomputed iteration budget is exhausted; no clock
// reads happen on the hot path.
bool const res = total_iterations_++ < max_iterations;
if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
// Loop is done: stop the timer so post-loop teardown is not measured.
assert(started_);
PauseTiming();
// Total iterations now is one greater than max iterations. Fix this.
total_iterations_ = max_iterations;
}
return res;
}
// REQUIRES: timer is running
// Stop the benchmark timer. If not called, the timer will be
// automatically stopped after KeepRunning() returns false for the first time.
//
// For threaded benchmarks the PauseTiming() function acts
// like a barrier. I.e., the ith call by a particular thread to this
// function will block until all threads have made their ith call.
// The timer will stop when the last thread has called this function.
//
// NOTE: PauseTiming()/ResumeTiming() are relatively
// heavyweight, and so their use should generally be avoided
// within each benchmark iteration, if possible.
void PauseTiming(); void PauseTiming();
// REQUIRES: timer is not running
// Start the benchmark timer. The timer is NOT running on entrance to the
// benchmark function. It begins running after the first call to KeepRunning()
//
// For threaded benchmarks the ResumeTiming() function acts
// like a barrier. I.e., the ith call by a particular thread to this
// function will block until all threads have made their ith call.
// The timer will start when the last thread has called this function.
//
// NOTE: PauseTiming()/ResumeTiming() are relatively
// heavyweight, and so their use should generally be avoided
// within each benchmark iteration, if possible.
void ResumeTiming(); void ResumeTiming();
// If a particular benchmark is I/O bound, or if for some reason CPU
// timings are not representative, call this method from within the
// benchmark routine. If called, the elapsed time will be used to
// control how many iterations are run, and in the printing of
// items/second or MB/seconds values. If not called, the cpu time
// used by the benchmark will be used.
void UseRealTime();
// Set the number of bytes processed by the current benchmark // Set the number of bytes processed by the current benchmark
// execution. This routine is typically called once at the end of a // execution. This routine is typically called once at the end of a
// throughput oriented benchmark. If this routine is called with a // throughput oriented benchmark. If this routine is called with a
...@@ -195,7 +235,15 @@ class State { ...@@ -195,7 +235,15 @@ class State {
// per iteration. // per iteration.
// //
// REQUIRES: a benchmark has exited its KeepRunning loop. // REQUIRES: a benchmark has exited its KeepRunning loop.
void SetBytesProcessed(int64_t bytes); BENCHMARK_ALWAYS_INLINE
void SetBytesProcessed(size_t bytes) {
bytes_processed_ = bytes;
}
BENCHMARK_ALWAYS_INLINE
size_t bytes_processed() const {
return bytes_processed_;
}
// If this routine is called with items > 0, then an items/s // If this routine is called with items > 0, then an items/s
// label is printed on the benchmark report line for the currently // label is printed on the benchmark report line for the currently
...@@ -203,94 +251,76 @@ class State { ...@@ -203,94 +251,76 @@ class State {
// benchmark where a processing items/second output is desired. // benchmark where a processing items/second output is desired.
// //
// REQUIRES: a benchmark has exited its KeepRunning loop. // REQUIRES: a benchmark has exited its KeepRunning loop.
void SetItemsProcessed(int64_t items); BENCHMARK_ALWAYS_INLINE
void SetItemsProcessed(size_t items) {
items_processed_ = items;
}
BENCHMARK_ALWAYS_INLINE
size_t items_processed() const {
return items_processed_;
}
// If this routine is called, the specified label is printed at the // If this routine is called, the specified label is printed at the
// end of the benchmark report line for the currently executing // end of the benchmark report line for the currently executing
// benchmark. Example: // benchmark. Example:
// static void BM_Compress(benchmark::State& state) { // static void BM_Compress(int iters) {
// ... // ...
// double compress = input_size / output_size; // double compress = input_size / output_size;
// state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression)); // benchmark::SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
// } // }
// Produces output that looks like: // Produces output that looks like:
// BM_Compress 50 50 14115038 compress:27.3% // BM_Compress 50 50 14115038 compress:27.3%
// //
// REQUIRES: a benchmark has exited its KeepRunning loop. // REQUIRES: a benchmark has exited its KeepRunning loop.
void SetLabel(const std::string& label); void SetLabel(const char* label);
// Allow the use of std::string without actually including <string>.
// This function does not participate in overload resolution unless StringType
// has the nested typename `basic_string`. This typename should be provided
// as an injected class name in the case of std::string.
template <class StringType>
void SetLabel(StringType const & str,
typename StringType::basic_string* = 0) {
this->SetLabel(str.c_str());
}
// Range arguments for this run. CHECKs if the argument has been set. // Range arguments for this run. CHECKs if the argument has been set.
int range_x() const; BENCHMARK_ALWAYS_INLINE
int range_y() const; int range_x() const {
assert(has_range_x_);
int64_t iterations() const { return total_iterations_; } ((void)has_range_x_); // Prevent unused warning.
return range_x_;
const int thread_index; }
private:
class FastClock;
struct SharedState;
struct ThreadStats;
State(FastClock* clock, SharedState* s, int t);
bool StartRunning();
bool FinishInterval();
bool MaybeStop();
void NewInterval();
bool AllStarting();
static void* RunWrapper(void* arg);
void Run();
void RunAsThread();
void Wait();
enum EState {
STATE_INITIAL, // KeepRunning hasn't been called
STATE_STARTING, // KeepRunning called, waiting for other threads
STATE_RUNNING, // Running and being timed
STATE_STOPPING, // Not being timed but waiting for other threads
STATE_STOPPED // Stopped
};
EState state_;
FastClock* clock_;
// State shared by all BenchmarkRun objects that belong to the same
// BenchmarkInstance
SharedState* shared_;
std::thread thread_;
// Custom label set by the user. BENCHMARK_ALWAYS_INLINE
std::string label_; int range_y() const {
assert(has_range_y_);
((void)has_range_y_); // Prevent unused warning.
return range_y_;
}
// Each State object goes through a sequence of measurement intervals. By BENCHMARK_ALWAYS_INLINE
// default each interval is approx. 100ms in length. The following stats are size_t iterations() const { return total_iterations_; }
// kept for each interval.
int64_t iterations_;
double start_cpu_;
double start_time_;
int64_t stop_time_micros_;
double start_pause_cpu_; private:
double pause_cpu_time_; bool started_;
double start_pause_real_; size_t total_iterations_;
double pause_real_time_;
// Total number of iterations for all finished runs. bool has_range_x_;
int64_t total_iterations_; int range_x_;
// Approximate time in microseconds for one interval of execution. bool has_range_y_;
// Dynamically adjusted as needed. int range_y_;
int64_t interval_micros_;
// True if the current interval is the continuation of a previous one. size_t bytes_processed_;
bool is_continuation_; size_t items_processed_;
std::unique_ptr<ThreadStats> stats_; public:
const int thread_index;
const size_t max_iterations;
friend class internal::Benchmark; private:
BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State); BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
}; };
...@@ -304,7 +334,6 @@ class BenchmarkReporter { ...@@ -304,7 +334,6 @@ class BenchmarkReporter {
struct Context { struct Context {
int num_cpus; int num_cpus;
double mhz_per_cpu; double mhz_per_cpu;
// std::string cpu_info;
bool cpu_scaling_enabled; bool cpu_scaling_enabled;
// The number of chars in the longest benchmark name. // The number of chars in the longest benchmark name.
...@@ -312,19 +341,17 @@ class BenchmarkReporter { ...@@ -312,19 +341,17 @@ class BenchmarkReporter {
}; };
struct Run { struct Run {
Run() Run() :
: thread_index(-1), iterations(1),
iterations(1), real_accumulated_time(0),
real_accumulated_time(0), cpu_accumulated_time(0),
cpu_accumulated_time(0), bytes_per_second(0),
bytes_per_second(0), items_per_second(0),
items_per_second(0), max_heapbytes_used(0) {}
max_heapbytes_used(0) {}
std::string benchmark_name; std::string benchmark_name;
std::string report_label; std::string report_label; // Empty if not set by benchmark.
int thread_index; size_t iterations;
int64_t iterations;
double real_accumulated_time; double real_accumulated_time;
double cpu_accumulated_time; double cpu_accumulated_time;
...@@ -350,22 +377,12 @@ class BenchmarkReporter { ...@@ -350,22 +377,12 @@ class BenchmarkReporter {
// benchmark, thus have the same name. // benchmark, thus have the same name.
virtual void ReportRuns(const std::vector<Run>& report) const = 0; virtual void ReportRuns(const std::vector<Run>& report) const = 0;
virtual ~BenchmarkReporter() {} virtual ~BenchmarkReporter();
}; };
namespace internal { namespace internal {
typedef std::function<void(State&)> BenchmarkFunction; typedef void(Function)(State&);
// Run all benchmarks whose name is a partial match for the regular
// expression in "spec". The results of benchmark runs are fed to "reporter".
void RunMatchingBenchmarks(const std::string& spec,
const BenchmarkReporter* reporter);
// Extract the list of benchmark names that match the specified regular
// expression.
void FindMatchingBenchmarkNames(const std::string& re,
std::vector<std::string>* benchmark_names);
// ------------------------------------------------------ // ------------------------------------------------------
// Benchmark registration object. The BENCHMARK() macro expands // Benchmark registration object. The BENCHMARK() macro expands
...@@ -375,8 +392,7 @@ void FindMatchingBenchmarkNames(const std::string& re, ...@@ -375,8 +392,7 @@ void FindMatchingBenchmarkNames(const std::string& re,
// chained into one expression. // chained into one expression.
class Benchmark { class Benchmark {
public: public:
// The Benchmark takes ownership of the Callback pointed to by f. Benchmark(const char* name, Function* f);
Benchmark(const char* name, BenchmarkFunction f);
~Benchmark(); ~Benchmark();
...@@ -444,40 +460,25 @@ class Benchmark { ...@@ -444,40 +460,25 @@ class Benchmark {
// Used inside the benchmark implementation // Used inside the benchmark implementation
struct Instance; struct Instance;
// Measure the overhead of an empty benchmark to subtract later.
static void MeasureOverhead();
private: private:
friend class BenchmarkFamilies;
std::vector<Benchmark::Instance> CreateBenchmarkInstances(size_t rangeXindex,
size_t rangeYindex);
std::string name_; std::string name_;
BenchmarkFunction function_; Function* function_;
size_t registration_index_; std::size_t registration_index_;
std::vector<int> rangeX_; int arg_count_;
std::vector<int> rangeY_; std::vector< std::pair<int, int> > args_; // Args for all benchmark runs
std::vector<int> thread_counts_; std::vector<int> thread_counts_;
std::mutex mutex_;
// Special value placed in thread_counts_ to stand for NumCPUs() // Special value placed in thread_counts_ to stand for NumCPUs()
static const int kNumCpuMarker = -1; static const int kNumCpuMarker = -1;
// Special value used to indicate that no range is required.
static const size_t kNoRangeIndex = std::numeric_limits<size_t>::max();
static const int kNoRange = std::numeric_limits<int>::max();
static void AddRange(std::vector<int>* dst, int lo, int hi, int mult); static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
static double MeasurePeakHeapMemory(const Instance& b);
static void RunInstance(const Instance& b, const BenchmarkReporter* br); friend class BenchmarkFamilies;
friend class ::benchmark::State;
friend struct ::benchmark::internal::Benchmark::Instance;
friend void ::benchmark::internal::RunMatchingBenchmarks(
const std::string&, const BenchmarkReporter*);
BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark); BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark);
}; };
// ------------------------------------------------------ // ------------------------------------------------------
// Internal implementation details follow; please ignore // Internal implementation details follow; please ignore
...@@ -487,16 +488,16 @@ class ConsoleReporter : public BenchmarkReporter { ...@@ -487,16 +488,16 @@ class ConsoleReporter : public BenchmarkReporter {
public: public:
virtual bool ReportContext(const Context& context) const; virtual bool ReportContext(const Context& context) const;
virtual void ReportRuns(const std::vector<Run>& reports) const; virtual void ReportRuns(const std::vector<Run>& reports) const;
private: private:
std::string PrintMemoryUsage(double bytes) const;
virtual void PrintRunData(const Run& report) const; virtual void PrintRunData(const Run& report) const;
// TODO(ericwf): Find a better way to share this information.
mutable size_t name_field_width_; mutable size_t name_field_width_;
}; };
} // end namespace internal } // end namespace internal
} // end namespace benchmark } // end namespace benchmark
// ------------------------------------------------------ // ------------------------------------------------------
// Macro to register benchmarks // Macro to register benchmarks
...@@ -534,4 +535,11 @@ class ConsoleReporter : public BenchmarkReporter { ...@@ -534,4 +535,11 @@ class ConsoleReporter : public BenchmarkReporter {
__benchmark_, n, __LINE__) BENCHMARK_UNUSED = \ __benchmark_, n, __LINE__) BENCHMARK_UNUSED = \
(new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n<a, b>)) (new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n<a, b>))
// Helper macro to create a main routine in a test that runs the benchmarks
#define BENCHMARK_MAIN() \
int main(int argc, const char** argv) { \
::benchmark::Initialize(&argc, argv); \
::benchmark::RunSpecifiedBenchmarks(); \
}
#endif // BENCHMARK_BENCHMARK_H_ #endif // BENCHMARK_BENCHMARK_H_
...@@ -2,9 +2,8 @@ ...@@ -2,9 +2,8 @@
include_directories(${PROJECT_SOURCE_DIR}/src) include_directories(${PROJECT_SOURCE_DIR}/src)
# Define the source files # Define the source files
set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc" set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc" "log.cc"
"log.cc" "sleep.cc" "string_util.cc" "sysinfo.cc" "sleep.cc" "string_util.cc" "sysinfo.cc" "walltime.cc")
"walltime.cc")
# Determine the correct regular expression engine to use # Determine the correct regular expression engine to use
if(HAVE_STD_REGEX) if(HAVE_STD_REGEX)
set(RE_FILES "re_std.cc") set(RE_FILES "re_std.cc")
......
...@@ -13,30 +13,30 @@ ...@@ -13,30 +13,30 @@
// limitations under the License. // limitations under the License.
#include "benchmark/benchmark.h" #include "benchmark/benchmark.h"
#include "arraysize.h"
#include "check.h"
#include "colorprint.h"
#include "commandlineflags.h"
#include "internal_macros.h"
#include "log.h"
#include "re.h"
#include "sleep.h"
#include "stat.h"
#include "string_util.h"
#include "sysinfo.h"
#include "walltime.h"
#include <sys/time.h> #include <sys/time.h>
#include <string.h> #include <sys/resource.h>
#include <unistd.h>
#include <cstdlib>
#include <cstring>
#include <algorithm> #include <algorithm>
#include <atomic> #include <atomic>
#include <condition_variable> #include <condition_variable>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <mutex>
#include <thread> #include <thread>
#include <sstream>
#include "check.h"
#include "commandlineflags.h"
#include "colorprint.h"
#include "log.h"
#include "mutex.h"
#include "re.h"
#include "stat.h"
#include "string_util.h"
#include "sysinfo.h"
#include "walltime.h"
DEFINE_string(benchmark_filter, ".", DEFINE_string(benchmark_filter, ".",
"A regular expression that specifies the set of benchmarks " "A regular expression that specifies the set of benchmarks "
...@@ -57,124 +57,121 @@ DEFINE_double(benchmark_min_time, 0.5, ...@@ -57,124 +57,121 @@ DEFINE_double(benchmark_min_time, 0.5,
"of the benchmark execution, regardless of number of " "of the benchmark execution, regardless of number of "
"threads."); "threads.");
DEFINE_bool(benchmark_memory_usage, false,
"Report memory usage for all benchmarks");
DEFINE_int32(benchmark_repetitions, 1, DEFINE_int32(benchmark_repetitions, 1,
"The number of runs of each benchmark. If greater than 1, the " "The number of runs of each benchmark. If greater than 1, the "
"mean and standard deviation of the runs will be reported."); "mean and standard deviation of the runs will be reported.");
DEFINE_int32(v, 0, "The level of verbose logging to output");
DEFINE_bool(color_print, true, "Enables colorized logging."); DEFINE_bool(color_print, true, "Enables colorized logging.");
// Will be non-empty if heap checking is turned on, which would DEFINE_int32(v, 0, "The level of verbose logging to output");
// invalidate any benchmarks.
DECLARE_string(heap_check);
// The ""'s catch people who don't pass in a literal for "str" // The ""'s catch people who don't pass in a literal for "str"
#define strliterallen(str) (sizeof("" str "") - 1) #define strliterallen(str) (sizeof("" str "") - 1)
// Must use a string literal for prefix. // Must use a string literal for prefix.
#define memprefix(str, len, prefix) \ #define memprefix(str, len, prefix) \
((((len) >= strliterallen(prefix)) && \ ((((len) >= strliterallen(prefix)) && \
memcmp(str, prefix, strliterallen(prefix)) == 0) \ std::memcmp(str, prefix, strliterallen(prefix)) == 0) \
? str + strliterallen(prefix) \ ? str + strliterallen(prefix) \
: NULL) : nullptr)
namespace benchmark { namespace benchmark {
namespace internal {
// NOTE: This is a dummy "mutex" type used to denote the actual mutex
// returned by GetBenchmarkLock(). This is only used to placate the thread
// safety warnings by giving the return of GetBenchmarkLock() a name.
struct CAPABILITY("mutex") BenchmarkLockType {};
BenchmarkLockType BenchmarkLockVar;
} // end namespace internal
inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
GetBenchmarkLock()
{
static Mutex lock;
return lock;
}
namespace { namespace {
// For non-dense Range, intermediate values are powers of kRangeMultiplier. // For non-dense Range, intermediate values are powers of kRangeMultiplier.
static const int kRangeMultiplier = 8; static const int kRangeMultiplier = 8;
static const int kMaxIterations = 1000000000;
std::mutex starting_mutex;
std::condition_variable starting_cv;
bool running_benchmark = false; bool running_benchmark = false;
// Should this benchmark report memory usage? // Global variable so that a benchmark can cause a little extra printing
bool get_memory_usage; std::string* GetReportLabel() {
static std::string label GUARDED_BY(GetBenchmarkLock());
return &label;
}
// Should this benchmark base decisions off of real time rather than // Should this benchmark base decisions off of real time rather than
// cpu time? // cpu time?
bool use_real_time; bool use_real_time GUARDED_BY(GetBenchmarkLock());
// Overhead of an empty benchmark.
double overhead = 0.0;
// Return prefix to print in front of each reported line
const char* Prefix() {
#ifdef NDEBUG
return "";
#else
return "DEBUG: ";
#endif
}
// TODO // TODO(ericwf): support MallocCounter.
// static internal::MallocCounter *benchmark_mc; //static benchmark::MallocCounter *benchmark_mc;
bool CpuScalingEnabled() { static bool CpuScalingEnabled() {
// On Linux, the CPUfreq subsystem exposes CPU information as files on the // On Linux, the CPUfreq subsystem exposes CPU information as files on the
// local file system. If reading the exported files fails, then we may not be // local file system. If reading the exported files fails, then we may not be
// running on Linux, so we silently ignore all the read errors. // running on Linux, so we silently ignore all the read errors.
for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) { for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
std::stringstream ss; std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu,
ss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor"; "/cpufreq/scaling_governor");
std::string governor_file = ss.str();
FILE* file = fopen(governor_file.c_str(), "r"); FILE* file = fopen(governor_file.c_str(), "r");
if (!file) break; if (!file) break;
char buff[16]; char buff[16];
size_t bytes_read = fread(buff, 1, sizeof(buff), file); size_t bytes_read = fread(buff, 1, sizeof(buff), file);
fclose(file); fclose(file);
if (memprefix(buff, bytes_read, "performance") == NULL) return true; if (memprefix(buff, bytes_read, "performance") == nullptr) return true;
} }
return false; return false;
} }
// Given a collection of reports, computes their mean and stddev.
// REQUIRES: all runs in "reports" must be from the same benchmark.
void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports, void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports,
BenchmarkReporter::Run* mean_data, BenchmarkReporter::Run* mean_data,
BenchmarkReporter::Run* stddev_data) { BenchmarkReporter::Run* stddev_data) {
CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports";
// Accumulators. // Accumulators.
Stat1_d real_accumulated_time_stat; Stat1_d real_accumulated_time_stat;
Stat1_d cpu_accumulated_time_stat; Stat1_d cpu_accumulated_time_stat;
Stat1_d items_per_second_stat;
Stat1_d bytes_per_second_stat; Stat1_d bytes_per_second_stat;
Stat1_d iterations_stat; Stat1_d items_per_second_stat;
Stat1MinMax_d max_heapbytes_used_stat; // All repetitions should be run with the same number of iterations so we
// can take this information from the first benchmark.
std::size_t const run_iterations = reports.front().iterations;
// Populate the accumulators. // Populate the accumulators.
for (std::vector<BenchmarkReporter::Run>::const_iterator it = reports.begin(); for (BenchmarkReporter::Run const& run : reports) {
it != reports.end(); ++it) { CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
CHECK_EQ(reports[0].benchmark_name, it->benchmark_name); CHECK_EQ(run_iterations, run.iterations);
real_accumulated_time_stat += real_accumulated_time_stat +=
Stat1_d(it->real_accumulated_time / it->iterations, it->iterations); Stat1_d(run.real_accumulated_time/run.iterations, run.iterations);
cpu_accumulated_time_stat += cpu_accumulated_time_stat +=
Stat1_d(it->cpu_accumulated_time / it->iterations, it->iterations); Stat1_d(run.cpu_accumulated_time/run.iterations, run.iterations);
items_per_second_stat += Stat1_d(it->items_per_second, it->iterations); items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
bytes_per_second_stat += Stat1_d(it->bytes_per_second, it->iterations); bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
iterations_stat += Stat1_d(it->iterations, it->iterations);
max_heapbytes_used_stat +=
Stat1MinMax_d(it->max_heapbytes_used, it->iterations);
} }
// Get the data from the accumulator to BenchmarkRunData's. In the // Get the data from the accumulator to BenchmarkReporter::Run's.
// computations below we must multiply by the number of iterations since
// PrintRunData will divide by it.
mean_data->benchmark_name = reports[0].benchmark_name + "_mean"; mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
mean_data->iterations = iterations_stat.Mean(); mean_data->iterations = run_iterations;
mean_data->real_accumulated_time = real_accumulated_time_stat.Mean() * mean_data->real_accumulated_time = real_accumulated_time_stat.Mean() *
mean_data->iterations; run_iterations;
mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Mean() * mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Mean() *
mean_data->iterations; run_iterations;
mean_data->bytes_per_second = bytes_per_second_stat.Mean(); mean_data->bytes_per_second = bytes_per_second_stat.Mean();
mean_data->items_per_second = items_per_second_stat.Mean(); mean_data->items_per_second = items_per_second_stat.Mean();
mean_data->max_heapbytes_used = max_heapbytes_used_stat.Max();
// Only add label to mean/stddev if it is same for all runs // Only add label to mean/stddev if it is same for all runs
mean_data->report_label = reports[0].report_label; mean_data->report_label = reports[0].report_label;
for (size_t i = 1; i < reports.size(); i++) { for (std::size_t i = 1; i < reports.size(); i++) {
if (reports[i].report_label != reports[0].report_label) { if (reports[i].report_label != reports[0].report_label) {
mean_data->report_label = ""; mean_data->report_label = "";
break; break;
...@@ -183,29 +180,166 @@ void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports, ...@@ -183,29 +180,166 @@ void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports,
stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev"; stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
stddev_data->report_label = mean_data->report_label; stddev_data->report_label = mean_data->report_label;
stddev_data->iterations = iterations_stat.StdDev(); stddev_data->iterations = 0;
// The value of iterations_stat.StdDev() above may be 0 if all the repetitions stddev_data->real_accumulated_time =
// have the same number of iterations. Blindly multiplying by 0 in the real_accumulated_time_stat.StdDev();
// computation of real/cpu_accumulated_time below would lead to 0/0 in stddev_data->cpu_accumulated_time =
// PrintRunData. So we skip the multiplication in this case and PrintRunData cpu_accumulated_time_stat.StdDev();
// skips the division.
if (stddev_data->iterations == 0) {
stddev_data->real_accumulated_time = real_accumulated_time_stat.StdDev();
stddev_data->cpu_accumulated_time = cpu_accumulated_time_stat.StdDev();
} else {
stddev_data->real_accumulated_time = real_accumulated_time_stat.StdDev() *
stddev_data->iterations;
stddev_data->cpu_accumulated_time = cpu_accumulated_time_stat.StdDev() *
stddev_data->iterations;
}
stddev_data->bytes_per_second = bytes_per_second_stat.StdDev(); stddev_data->bytes_per_second = bytes_per_second_stat.StdDev();
stddev_data->items_per_second = items_per_second_stat.StdDev(); stddev_data->items_per_second = items_per_second_stat.StdDev();
stddev_data->max_heapbytes_used = max_heapbytes_used_stat.StdDev();
} }
} // namespace
// Counters accumulated by a single benchmark thread; merged into the
// final report once all threads finish.
struct ThreadStats {
  ThreadStats() {}
  int64_t bytes_processed = 0;  // From State::SetBytesProcessed().
  int64_t items_processed = 0;  // From State::SetItemsProcessed().
};
// Timer management class
//
// Coordinates the benchmark timer across all threads of a multi-threaded
// benchmark. StartTimer()/StopTimer() act as phase barriers: every thread
// blocks until all threads have arrived, and the last thread to arrive
// performs the actual clock reads, so the timed region covers all threads
// exactly once per phase.
class TimerManager {
public:
TimerManager(int num_threads, Notification* done)
: num_threads_(num_threads),
done_(done),
running_(false),
real_time_used_(0),
cpu_time_used_(0),
num_finalized_(0),
phase_number_(0),
entered_(0) {
}
// Called by each thread
// Blocks until all threads have called it; the last thread records the
// start wall/CPU times and wakes the waiters.
void StartTimer() EXCLUDES(lock_) {
bool last_thread = false;
{
MutexLock ml(lock_);
last_thread = Barrier(ml);
if (last_thread) {
CHECK(!running_) << "Called StartTimer when timer is already running";
running_ = true;
start_real_time_ = walltime::Now();
start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
}
}
// notify_all() is issued outside the critical section so woken threads
// can immediately acquire the lock.
if (last_thread) {
phase_condition_.notify_all();
}
}
// Called by each thread
// Blocks until all threads have called it; the last thread accumulates
// the elapsed wall/CPU slice and wakes the waiters.
void StopTimer() EXCLUDES(lock_) {
bool last_thread = false;
{
MutexLock ml(lock_);
last_thread = Barrier(ml);
if (last_thread) {
CHECK(running_) << "Called StopTimer when timer is already stopped";
InternalStop();
}
}
if (last_thread) {
phase_condition_.notify_all();
}
}
// Called by each thread
// Signals `done_` once every thread has finished; the timer must already
// be stopped at that point.
void Finalize() EXCLUDES(lock_) {
MutexLock l(lock_);
num_finalized_++;
if (num_finalized_ == num_threads_) {
CHECK(!running_) <<
"The timer should be stopped before the timer is finalized";
done_->Notify();
}
}
// REQUIRES: timer is not running
double real_time_used() EXCLUDES(lock_) {
MutexLock l(lock_);
CHECK(!running_);
return real_time_used_;
}
// REQUIRES: timer is not running
double cpu_time_used() EXCLUDES(lock_) {
MutexLock l(lock_);
CHECK(!running_);
return cpu_time_used_;
}
private:
Mutex lock_;
Condition phase_condition_;
int num_threads_;
Notification* done_;
bool running_; // Is the timer running
double start_real_time_; // If running_
double start_cpu_time_; // If running_
// Accumulated time so far (does not contain current slice if running_)
double real_time_used_;
double cpu_time_used_;
// How many threads have called Finalize()
int num_finalized_;
// State for barrier management
int phase_number_;
int entered_; // Number of threads that have entered this barrier
// Fold the current timed slice into the accumulated totals and mark the
// timer stopped. Caller must hold lock_.
void InternalStop() REQUIRES(lock_) {
CHECK(running_);
running_ = false;
real_time_used_ += walltime::Now() - start_real_time_;
cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage())
- start_cpu_time_);
}
// Enter the barrier and wait until all other threads have also
// entered the barrier. Returns true iff this is the last thread to
// enter the barrier.
bool Barrier(MutexLock& ml) REQUIRES(lock_) {
CHECK_LT(entered_, num_threads_);
entered_++;
if (entered_ < num_threads_) {
// Wait for all threads to enter
// The phase number is copied so the wait predicate is immune to
// spurious wakeups and to later phases reusing this barrier.
int phase_number_cp = phase_number_;
auto cb = [this, phase_number_cp]() {
return this->phase_number_ > phase_number_cp;
};
phase_condition_.wait(ml.native_handle(), cb);
return false; // I was not the last one
} else {
// Last thread has reached the barrier
phase_number_++;
entered_ = 0;
return true;
}
}
};
// TimerManager for current run.
static std::unique_ptr<TimerManager> timer_manager = nullptr;
} // end namespace
namespace internal { namespace internal {
// Information kept per benchmark we may want to run
struct Benchmark::Instance {
std::string name;
Function* function;
bool has_arg1;
int arg1;
bool has_arg2;
int arg2;
int threads; // Number of concurrent threads to use
bool multithreaded; // Is benchmark multi-threaded?
};
// Class for managing registered benchmarks. Note that each registered // Class for managing registered benchmarks. Note that each registered
// benchmark identifies a family of related benchmarks to run. // benchmark identifies a family of related benchmarks to run.
class BenchmarkFamilies { class BenchmarkFamilies {
...@@ -220,16 +354,17 @@ class BenchmarkFamilies { ...@@ -220,16 +354,17 @@ class BenchmarkFamilies {
// Extract the list of benchmark instances that match the specified // Extract the list of benchmark instances that match the specified
// regular expression. // regular expression.
void FindBenchmarks(const std::string& re, bool FindBenchmarks(const std::string& re,
std::vector<Benchmark::Instance>* benchmarks); std::vector<Benchmark::Instance>* benchmarks);
private: private:
BenchmarkFamilies(); BenchmarkFamilies();
~BenchmarkFamilies(); ~BenchmarkFamilies();
std::vector<Benchmark*> families_; std::vector<Benchmark*> families_;
std::mutex mutex_; Mutex mutex_;
}; };
BenchmarkFamilies* BenchmarkFamilies::GetInstance() { BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
static BenchmarkFamilies instance; static BenchmarkFamilies instance;
return &instance; return &instance;
...@@ -244,7 +379,7 @@ BenchmarkFamilies::~BenchmarkFamilies() { ...@@ -244,7 +379,7 @@ BenchmarkFamilies::~BenchmarkFamilies() {
} }
size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) { size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) {
std::lock_guard<std::mutex> l(mutex_); MutexLock l(mutex_);
// This loop attempts to reuse an entry that was previously removed to avoid // This loop attempts to reuse an entry that was previously removed to avoid
// unncessary growth of the vector. // unncessary growth of the vector.
for (size_t index = 0; index < families_.size(); ++index) { for (size_t index = 0; index < families_.size(); ++index) {
...@@ -259,392 +394,133 @@ size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) { ...@@ -259,392 +394,133 @@ size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) {
} }
void BenchmarkFamilies::RemoveBenchmark(size_t index) { void BenchmarkFamilies::RemoveBenchmark(size_t index) {
std::lock_guard<std::mutex> l(mutex_); MutexLock l(mutex_);
families_[index] = NULL; families_[index] = nullptr;
// Don't shrink families_ here, we might be called by the destructor of // Don't shrink families_ here, we might be called by the destructor of
// BenchmarkFamilies which iterates over the vector. // BenchmarkFamilies which iterates over the vector.
} }
void BenchmarkFamilies::FindBenchmarks( bool BenchmarkFamilies::FindBenchmarks(
const std::string& spec, const std::string& spec,
std::vector<Benchmark::Instance>* benchmarks) { std::vector<Benchmark::Instance>* benchmarks) {
// Make regular expression out of command-line flag // Make regular expression out of command-line flag
std::string error_msg;
Regex re; Regex re;
std::string re_error; if (!re.Init(spec, &error_msg)) {
if (!re.Init(spec, &re_error)) { std::cerr << "Could not compile benchmark re: " << error_msg << std::endl;
std::cerr << "Could not compile benchmark re: " << re_error << std::endl; return false;
return;
} }
std::lock_guard<std::mutex> l(mutex_); // Special list of thread counts to use when none are specified
for (internal::Benchmark* family : families_) { std::vector<int> one_thread;
if (family == nullptr) continue; // Family was deleted one_thread.push_back(1);
MutexLock l(mutex_);
for (Benchmark* family : families_) {
// Family was deleted or benchmark doesn't match
if (family == nullptr || !re.Match(family->name_)) continue;
// Match against filter. if (family->arg_count_ == -1) {
if (!re.Match(family->name_)) { family->arg_count_ = 0;
VLOG(1) << "Skipping " << family->name_ << "\n"; family->args_.emplace_back(-1, -1);
continue;
} }
for (auto const& args : family->args_) {
const std::vector<int>* thread_counts =
(family->thread_counts_.empty()
? &one_thread
: &family->thread_counts_);
for (int num_threads : *thread_counts) {
Benchmark::Instance instance;
instance.name = family->name_;
instance.function = family->function_;
instance.has_arg1 = family->arg_count_ >= 1;
instance.arg1 = args.first;
instance.has_arg2 = family->arg_count_ == 2;
instance.arg2 = args.second;
instance.threads = num_threads;
instance.multithreaded = !(family->thread_counts_.empty());
// Add arguments to instance name
if (family->arg_count_ >= 1) {
AppendHumanReadable(instance.arg1, &instance.name);
}
if (family->arg_count_ >= 2) {
AppendHumanReadable(instance.arg2, &instance.name);
}
std::vector<Benchmark::Instance> instances; // Add the number of threads used to the name
if (family->rangeX_.empty() && family->rangeY_.empty()) { if (!family->thread_counts_.empty()) {
instances = family->CreateBenchmarkInstances( instance.name += StringPrintF("/threads:%d", instance.threads);
Benchmark::kNoRangeIndex, Benchmark::kNoRangeIndex);
std::copy(instances.begin(), instances.end(),
std::back_inserter(*benchmarks));
} else if (family->rangeY_.empty()) {
for (size_t x = 0; x < family->rangeX_.size(); ++x) {
instances = family->CreateBenchmarkInstances(
x, Benchmark::kNoRangeIndex);
std::copy(instances.begin(), instances.end(),
std::back_inserter(*benchmarks));
}
} else {
for (size_t x = 0; x < family->rangeX_.size(); ++x) {
for (size_t y = 0; y < family->rangeY_.size(); ++y) {
instances = family->CreateBenchmarkInstances(x, y);
std::copy(instances.begin(), instances.end(),
std::back_inserter(*benchmarks));
} }
benchmarks->push_back(instance);
} }
} }
} }
}
// Render `bytes` as a human-readable peak-memory suffix, e.g.
// " 1.2MB peak-mem".  Returns an empty string when memory tracking is
// disabled or the measurement is negative (i.e. unavailable).
std::string ConsoleReporter::PrintMemoryUsage(double bytes) const {
  if (!get_memory_usage || bytes < 0.0) {
    return std::string();
  }
  std::stringstream out;
  out << " " << HumanReadableNumber(bytes) << "B peak-mem";
  return out.str();
}
// Print the run-wide header: CPU configuration, timestamp, scaling
// warning, and the column headers for the per-run table.  Returns true so
// benchmarking proceeds.
bool ConsoleReporter::ReportContext(const BenchmarkReporter::Context& context)
    const {
  // Remember the widest benchmark name so PrintRunData can align columns.
  name_field_width_ = context.name_field_width;

  std::cout << "Benchmarking on " << context.num_cpus << " X "
            << context.mhz_per_cpu << " MHz CPU"
            << ((context.num_cpus > 1) ? "s" : "") << "\n";

  int remainder_ms;
  std::cout << walltime::Print(walltime::Now(), "%Y/%m/%d-%H:%M:%S",
                               true,  // use local timezone
                               &remainder_ms) << "\n";

  // Show details of CPU model, caches, TLBs etc.
  //  if (!context.cpu_info.empty())
  //    std::cout << "CPU: " << context.cpu_info.c_str();

  // Warn on stderr so the notice is visible even if stdout is redirected.
  if (context.cpu_scaling_enabled) {
    std::cerr << "CPU scaling is enabled: Benchmark timings may be noisy.\n";
  }

  // fprintf returns the number of characters written; reuse that count to
  // size the separator line (minus the trailing newline).
  int output_width = fprintf(stdout, "%s%-*s %10s %10s %10s\n",
                             Prefix(), int(name_field_width_), "Benchmark",
                             "Time(ns)", "CPU(ns)", "Iterations");
  std::cout << std::string(output_width - 1, '-').c_str() << "\n";

  return true;
}
// Print one line per run, then — when the benchmark was repeated — the
// aggregate mean and stddev rows computed by ComputeStats().
void ConsoleReporter::ReportRuns(
    const std::vector<BenchmarkReporter::Run>& reports) const {
  for (const BenchmarkReporter::Run& run : reports) {
    // Every run in a batch must come from the same benchmark.
    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
    PrintRunData(run);
  }

  // We don't report aggregated data if there was a single run.
  if (reports.size() < 2) {
    return;
  }

  BenchmarkReporter::Run mean_data;
  BenchmarkReporter::Run stddev_data;
  ComputeStats(reports, &mean_data, &stddev_data);
  PrintRunData(mean_data);
  PrintRunData(stddev_data);
}
// Print a single run as one colorized table row: name, per-iteration real
// and CPU time, iteration count, and optional throughput/label/memory info.
void ConsoleReporter::PrintRunData(const BenchmarkReporter::Run& result) const {
  // Build an optional throughput suffix; empty when the benchmark did not
  // report the corresponding counter.
  auto throughput = [](double value, const char* suffix) -> std::string {
    if (!(value > 0)) {
      return std::string();
    }
    std::stringstream ss;
    ss << " " << HumanReadableNumber(value) << suffix;
    return ss.str();
  };
  const std::string rate = throughput(result.bytes_per_second, "B/s");
  const std::string items = throughput(result.items_per_second, " items/s");

  ColorPrintf(COLOR_DEFAULT, "%s", Prefix());
  ColorPrintf(COLOR_GREEN, "%-*s ",
              name_field_width_, result.benchmark_name.c_str());

  // Aggregate rows (e.g. stddev) carry iterations == 0 and their times are
  // printed undivided; normal rows are shown per-iteration.
  double real_ns = result.real_accumulated_time * 1e9;
  double cpu_ns = result.cpu_accumulated_time * 1e9;
  if (result.iterations != 0) {
    real_ns /= static_cast<double>(result.iterations);
    cpu_ns /= static_cast<double>(result.iterations);
  }
  ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", real_ns, cpu_ns);

  ColorPrintf(COLOR_CYAN, "%10lld", result.iterations);
  ColorPrintf(COLOR_DEFAULT, "%*s %*s %s %s\n",
              13, rate.c_str(),
              18, items.c_str(),
              result.report_label.c_str(),
              PrintMemoryUsage(result.max_heapbytes_used).c_str());
}
/* TODO(dominic)
void MemoryUsage() {
// if (benchmark_mc) {
// benchmark_mc->Reset();
//} else {
get_memory_usage = true;
//}
}
*/
// Print the supported command line flags and terminate the process.
// Exits with status 0 because asking for help is not an error.
void PrintUsageAndExit() {
  fprintf(stdout,
          "benchmark [--benchmark_filter=<regex>]\n"
          " [--benchmark_iterations=<iterations>]\n"
          " [--benchmark_min_time=<min_time>]\n"
          //" [--benchmark_memory_usage]\n"
          " [--benchmark_repetitions=<num_repetitions>]\n"
          " [--color_print={true|false}]\n"
          " [--v=<verbosity>]\n");
  exit(0);
}
// Consume benchmark-specific flags from (argc, argv), removing each
// recognized flag in place so that user code only sees the leftovers.
// The Parse*Flag helpers both match the flag name and store its value.
void ParseCommandLineFlags(int* argc, const char** argv) {
  for (int i = 1; i < *argc; ++i) {
    if (ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
        ParseInt32Flag(argv[i], "benchmark_iterations",
                       &FLAGS_benchmark_iterations) ||
        ParseDoubleFlag(argv[i], "benchmark_min_time",
                        &FLAGS_benchmark_min_time) ||
        // TODO(dominic)
        //            ParseBoolFlag(argv[i], "gbenchmark_memory_usage",
        //                          &FLAGS_gbenchmark_memory_usage) ||
        ParseInt32Flag(argv[i], "benchmark_repetitions",
                       &FLAGS_benchmark_repetitions) ||
        ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) ||
        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
      // Shift the remaining arguments left over the consumed one.
      // NOTE(review): reading argv[*argc] assumes this is main's argv,
      // whose terminating NULL entry makes that access well-defined.
      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];

      --(*argc);
      --i;  // Re-examine the slot that now holds the next argument.
    } else if (IsFlag(argv[i], "help"))
      PrintUsageAndExit();
  }
}
} // end namespace internal
// A clock that provides a fast mechanism to check if we're nearly done.
//
// A background thread refreshes approx_time_ roughly once per millisecond,
// so the hot-path query (HasReached) is a single atomic load instead of a
// clock syscall on every benchmark iteration.
class State::FastClock {
 public:
  enum Type {
    REAL_TIME,  // wall-clock time (walltime::Now)
    CPU_TIME    // process + children CPU time
  };
  explicit FastClock(Type type)
      : type_(type),
        approx_time_(NowMicros()),
        bg_done_(false),
        bg_(BGThreadWrapper, this) { }

  ~FastClock() {
    // Wake the background thread and wait for it to exit before the
    // members it uses are destroyed.
    {
      std::unique_lock<std::mutex> l(bg_mutex_);
      bg_done_ = true;
      bg_cond_.notify_one();
    }
    bg_.join();
  }

  // Returns true if the current time is guaranteed to be past "when_micros".
  // This method is very fast.
  inline bool HasReached(int64_t when_micros) {
    return std::atomic_load(&approx_time_) >= when_micros;
  }

  // Returns the current time in microseconds past the epoch.
  int64_t NowMicros() const {
    double t = 0;
    switch (type_) {
      case REAL_TIME:
        t = walltime::Now();
        break;
      case CPU_TIME:
        t = MyCPUUsage() + ChildrenCPUUsage();
        break;
    }
    return static_cast<int64_t>(t * kNumMicrosPerSecond);
  }

  // Reinitialize if necessary (since clock type may be change once benchmark
  // function starts running - see UseRealTime).
  void InitType(Type type) {
    // Assign type_ while holding bg_mutex_: the background thread reads it
    // (via NowMicros) under the same mutex, so writing it before taking
    // the lock would be a data race.
    std::lock_guard<std::mutex> l(bg_mutex_);
    type_ = type;
    std::atomic_store(&approx_time_, NowMicros());
  }

 private:
  Type type_;
  std::atomic<int64_t> approx_time_;  // Last time measurement taken by bg_
  bool bg_done_;  // This is used to signal background thread to exit
  std::mutex bg_mutex_;
  std::condition_variable bg_cond_;
  std::thread bg_;  // Background thread that updates last_time_ once every ms

  static void* BGThreadWrapper(void* that) {
    static_cast<FastClock*>(that)->BGThread();
    return nullptr;
  }

  // Periodically refresh approx_time_ until the destructor sets bg_done_.
  void BGThread() {
    std::unique_lock<std::mutex> l(bg_mutex_);
    while (!bg_done_)
    {
      // Set timeout to 1 ms.
      bg_cond_.wait_for(l, std::chrono::milliseconds(1));
      std::atomic_store(&approx_time_, NowMicros());
    }
  }

  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(FastClock);
};
// Per-thread counters for user-reported work (bytes/items processed);
// individual threads' values are summed together via Add().
struct State::ThreadStats {
  int64_t bytes_processed = 0;
  int64_t items_processed = 0;

  ThreadStats() = default;

  // Zero both counters so the object can be reused for a fresh interval.
  void Reset() { *this = ThreadStats(); }

  // Fold another thread's counters into this accumulator.
  void Add(const ThreadStats& other) {
    bytes_processed += other.bytes_processed;
    items_processed += other.items_processed;
  }
};
namespace internal {

// Information kept per benchmark we may want to run.
// One Instance is produced for each (argument, thread-count) combination
// of a registered family; `name` already carries the human-readable
// argument and thread suffixes appended by CreateBenchmarkInstances().
struct Benchmark::Instance {
  Instance()
      : bm(nullptr),
        threads(1),
        rangeXset(false),
        rangeX(kNoRange),
        rangeYset(false),
        rangeY(kNoRange) {}

  std::string name;  // Family name plus argument/thread decorations
  Benchmark* bm;     // Owning family (not owned by this instance)
  int threads;       // Number of concurrent threads to use

  bool rangeXset;    // Was an X argument supplied for this instance?
  int rangeX;
  bool rangeYset;    // Was a Y argument supplied for this instance?
  int rangeY;

  // A benchmark is multi-threaded iff its family registered thread counts.
  bool multithreaded() const { return !bm->thread_counts_.empty(); }
};

}  // end namespace internal
// State shared by every thread running the same benchmark instance.
// The counters are guarded by `mu`; `cond` coordinates the start/stop/
// exit handshakes between worker threads.
struct State::SharedState {
  const internal::Benchmark::Instance* instance;
  std::mutex mu;
  std::condition_variable cond;
  int starting;  // Number of threads that have entered STARTING state
  int stopping;  // Number of threads that have entered STOPPING state
  int exited;    // Number of threads that have complete exited
  int threads;   // Number of total threads that are running concurrently
  ThreadStats stats;  // Combined byte/item counts across all threads
  std::vector<BenchmarkReporter::Run> runs;  // accumulated runs
  std::string label;  // Report label attached to the runs

  // `b` may be null (e.g. when measuring the framework's own overhead via
  // MeasureOverhead), in which case a single thread is assumed.
  explicit SharedState(const internal::Benchmark::Instance* b)
      : instance(b),
        starting(0),
        stopping(0),
        exited(0),
        threads(b == nullptr ? 1 : b->threads) { }

  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SharedState);
};
namespace internal { Benchmark::Benchmark(const char* name,
Function* f)
Benchmark::Benchmark(const char* name, BenchmarkFunction f) : name_(name), function_(f), arg_count_(-1) {
: name_(name), function_(f) {
registration_index_ = BenchmarkFamilies::GetInstance()->AddBenchmark(this); registration_index_ = BenchmarkFamilies::GetInstance()->AddBenchmark(this);
} }
Benchmark::~Benchmark() { Benchmark::~Benchmark() {
BenchmarkFamilies::GetInstance()->RemoveBenchmark(registration_index_); BenchmarkFamilies::GetInstance()->RemoveBenchmark(registration_index_);
} }
Benchmark* Benchmark::Arg(int x) { Benchmark* Benchmark::Arg(int x) {
std::lock_guard<std::mutex> l(mutex_); CHECK(arg_count_ == -1 || arg_count_ == 1);
rangeX_.push_back(x); arg_count_ = 1;
args_.emplace_back(x, -1);
return this; return this;
} }
Benchmark* Benchmark::Range(int start, int limit) { Benchmark* Benchmark::Range(int start, int limit) {
CHECK(arg_count_ == -1 || arg_count_ == 1);
arg_count_ = 1;
std::vector<int> arglist; std::vector<int> arglist;
AddRange(&arglist, start, limit, kRangeMultiplier); AddRange(&arglist, start, limit, kRangeMultiplier);
std::lock_guard<std::mutex> l(mutex_); for (int i : arglist) {
for (size_t i = 0; i < arglist.size(); ++i) rangeX_.push_back(arglist[i]); args_.emplace_back(i, -1);
}
return this; return this;
} }
Benchmark* Benchmark::DenseRange(int start, int limit) { Benchmark* Benchmark::DenseRange(int start, int limit) {
CHECK(arg_count_ == -1 || arg_count_ == 1);
arg_count_ = 1;
CHECK_GE(start, 0); CHECK_GE(start, 0);
CHECK_LE(start, limit); CHECK_LE(start, limit);
std::lock_guard<std::mutex> l(mutex_); for (int arg = start; arg <= limit; arg++) {
for (int arg = start; arg <= limit; ++arg) rangeX_.push_back(arg); args_.emplace_back(arg, -1);
}
return this; return this;
} }
Benchmark* Benchmark::ArgPair(int x, int y) { Benchmark* Benchmark::ArgPair(int x, int y) {
std::lock_guard<std::mutex> l(mutex_); CHECK(arg_count_ == -1 || arg_count_ == 2);
rangeX_.push_back(x); arg_count_ = 2;
rangeY_.push_back(y); args_.emplace_back(x, y);
return this; return this;
} }
Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) { Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
CHECK(arg_count_ == -1 || arg_count_ == 2);
arg_count_ = 2;
std::vector<int> arglist1, arglist2; std::vector<int> arglist1, arglist2;
AddRange(&arglist1, lo1, hi1, kRangeMultiplier); AddRange(&arglist1, lo1, hi1, kRangeMultiplier);
AddRange(&arglist2, lo2, hi2, kRangeMultiplier); AddRange(&arglist2, lo2, hi2, kRangeMultiplier);
std::lock_guard<std::mutex> l(mutex_); for (int i : arglist1) {
rangeX_.resize(arglist1.size()); for (int j : arglist2) {
std::copy(arglist1.begin(), arglist1.end(), rangeX_.begin()); args_.emplace_back(i, j);
rangeY_.resize(arglist2.size()); }
std::copy(arglist2.begin(), arglist2.end(), rangeY_.begin()); }
return this; return this;
} }
...@@ -655,7 +531,6 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) { ...@@ -655,7 +531,6 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
Benchmark* Benchmark::Threads(int t) { Benchmark* Benchmark::Threads(int t) {
CHECK_GT(t, 0); CHECK_GT(t, 0);
std::lock_guard<std::mutex> l(mutex_);
thread_counts_.push_back(t); thread_counts_.push_back(t);
return this; return this;
} }
...@@ -664,14 +539,13 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { ...@@ -664,14 +539,13 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
CHECK_GT(min_threads, 0); CHECK_GT(min_threads, 0);
CHECK_GE(max_threads, min_threads); CHECK_GE(max_threads, min_threads);
std::lock_guard<std::mutex> l(mutex_);
AddRange(&thread_counts_, min_threads, max_threads, 2); AddRange(&thread_counts_, min_threads, max_threads, 2);
return this; return this;
} }
Benchmark* Benchmark::ThreadPerCpu() { Benchmark* Benchmark::ThreadPerCpu() {
std::lock_guard<std::mutex> l(mutex_); static int num_cpus = NumCPUs();
thread_counts_.push_back(NumCPUs()); thread_counts_.push_back(num_cpus);
return this; return this;
} }
...@@ -682,443 +556,310 @@ void Benchmark::AddRange(std::vector<int>* dst, int lo, int hi, int mult) { ...@@ -682,443 +556,310 @@ void Benchmark::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
// Add "lo" // Add "lo"
dst->push_back(lo); dst->push_back(lo);
static const int kint32max = std::numeric_limits<int32_t>::max();
// Now space out the benchmarks in multiples of "mult" // Now space out the benchmarks in multiples of "mult"
for (int32_t i = 1; i < std::numeric_limits<int32_t>::max() / mult; for (int32_t i = 1; i < kint32max/mult; i *= mult) {
i *= mult) {
if (i >= hi) break; if (i >= hi) break;
if (i > lo) dst->push_back(i); if (i > lo) {
dst->push_back(i);
}
} }
// Add "hi" (if different from "lo") // Add "hi" (if different from "lo")
if (hi != lo) dst->push_back(hi); if (hi != lo) {
dst->push_back(hi);
}
} }
std::vector<Benchmark::Instance> Benchmark::CreateBenchmarkInstances( } // end namespace internal
size_t rangeXindex, size_t rangeYindex) {
// Special list of thread counts to use when none are specified
std::vector<int> one_thread;
one_thread.push_back(1);
std::vector<Benchmark::Instance> instances; namespace {
const bool is_multithreaded = (!thread_counts_.empty());
const std::vector<int>& thread_counts =
(is_multithreaded ? thread_counts_ : one_thread);
for (int num_threads : thread_counts) {
Instance instance;
instance.name = name_;
instance.bm = this;
instance.threads = num_threads;
if (rangeXindex != kNoRangeIndex) {
instance.rangeX = rangeX_[rangeXindex];
instance.rangeXset = true;
AppendHumanReadable(instance.rangeX, &instance.name);
}
if (rangeYindex != kNoRangeIndex) {
instance.rangeY = rangeY_[rangeYindex];
instance.rangeYset = true;
AppendHumanReadable(instance.rangeY, &instance.name);
}
// Add the number of threads used to the name
if (is_multithreaded) {
std::stringstream ss;
ss << "/threads:" << instance.threads;
instance.name += ss.str();
}
instances.push_back(instance); // Execute one thread of benchmark b for the specified number of iterations.
// Adds the stats collected for the thread into *total.
void RunInThread(const benchmark::internal::Benchmark::Instance* b,
int iters, int thread_id,
ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id);
b->function(st);
CHECK(st.iterations() == st.max_iterations) <<
"Benchmark returned before State::KeepRunning() returned false!";
{
MutexLock l(GetBenchmarkLock());
total->bytes_processed += st.bytes_processed();
total->items_processed += st.items_processed();
} }
return instances; timer_manager->Finalize();
} }
void Benchmark::MeasureOverhead() { void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
State::FastClock clock(State::FastClock::CPU_TIME); const BenchmarkReporter* br) EXCLUDES(GetBenchmarkLock()) {
State::SharedState state(nullptr); int iters = FLAGS_benchmark_iterations ? FLAGS_benchmark_iterations
State runner(&clock, &state, 0); : 1;
while (runner.KeepRunning()) { std::vector<BenchmarkReporter::Run> reports;
}
overhead = state.runs[0].real_accumulated_time /
static_cast<double>(state.runs[0].iterations);
VLOG(1) << "Per-iteration overhead for doing nothing: " << overhead << "\n";
}
void Benchmark::RunInstance(const Instance& b, const BenchmarkReporter* br) { std::vector<std::thread> pool;
use_real_time = false; if (b.multithreaded)
running_benchmark = true; pool.resize(b.threads);
// get_memory_usage = FLAGS_gbenchmark_memory_usage;
State::FastClock clock(State::FastClock::CPU_TIME); for (int i = 0; i < FLAGS_benchmark_repetitions; i++) {
std::string mem;
while (true) {
// Try benchmark
VLOG(2) << "Running " << b.name << " for " << iters << "\n";
// Initialize the test runners.
State::SharedState state(&b);
{
std::vector<std::unique_ptr<State>> runners;
for (int i = 0; i < b.threads; ++i)
runners.push_back(std::unique_ptr<State>(new State(&clock, &state, i)));
// Run them all.
for (int i = 0; i < b.threads; ++i) {
if (b.multithreaded())
runners[i]->RunAsThread();
else
runners[i]->Run();
}
if (b.multithreaded()) {
for (int i = 0; i < b.threads; ++i) runners[i]->Wait();
}
}
/*
double mem_usage = 0;
if (get_memory_usage) {
// Measure memory usage
Notification mem_done;
BenchmarkRun mem_run;
BenchmarkRun::SharedState mem_shared(&b, 1);
mem_run.Init(&clock, &mem_shared, 0);
{ {
testing::MallocCounter mc(testing::MallocCounter::THIS_THREAD_ONLY); MutexLock l(GetBenchmarkLock());
benchmark_mc = &mc; GetReportLabel()->clear();
mem_run.Run(&mem_done); use_real_time = false;
mem_done.WaitForNotification();
benchmark_mc = NULL;
mem_usage = mc.PeakHeapGrowth();
} }
}
*/
running_benchmark = false;
for (BenchmarkReporter::Run& report : state.runs) {
double seconds = (use_real_time ? report.real_accumulated_time
: report.cpu_accumulated_time);
report.benchmark_name = b.name;
report.report_label = state.label;
report.bytes_per_second = state.stats.bytes_processed / seconds;
report.items_per_second = state.stats.items_processed / seconds;
report.max_heapbytes_used = MeasurePeakHeapMemory(b);
}
br->ReportRuns(state.runs); Notification done;
} timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));
ThreadStats total;
running_benchmark = true;
if (b.multithreaded) {
// If this is out first iteration of the while(true) loop then the
// threads haven't been started and can't be joined. Otherwise we need
// to join the thread before replacing them.
for (std::thread& thread : pool) {
if (thread.joinable())
thread.join();
}
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
pool[ti] = std::thread(&RunInThread, &b, iters, ti, &total);
}
} else {
// Run directly in this thread
RunInThread(&b, iters, 0, &total);
}
done.WaitForNotification();
running_benchmark = false;
// Run the specified benchmark, measure its peak memory usage, and const double cpu_accumulated_time = timer_manager->cpu_time_used();
// return the peak memory usage. const double real_accumulated_time = timer_manager->real_time_used();
double Benchmark::MeasurePeakHeapMemory(const Instance&) { timer_manager.reset();
if (!get_memory_usage) return 0.0;
double bytes = 0.0;
/* TODO(dominich)
// Should we do multi-threaded runs?
const int num_threads = 1;
const int num_iters = 1;
{
// internal::MallocCounter mc(internal::MallocCounter::THIS_THREAD_ONLY);
running_benchmark = true;
timer_manager = new TimerManager(1, NULL);
// benchmark_mc = &mc;
timer_manager->StartTimer();
b.Run(num_iters);
running_benchmark = false;
delete timer_manager;
timer_manager = NULL;
// benchmark_mc = NULL;
// bytes = mc.PeakHeapGrowth();
}
*/
return bytes;
}
} // end namespace internal VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
<< real_accumulated_time << "\n";
// Construct the per-thread State for one benchmark run.
// `clock` and `s` are shared by all threads of the instance; `t` is this
// thread's index (thread 0 performs the final cleanup rendezvous).
State::State(FastClock* clock, SharedState* s, int t)
    : thread_index(t),
      state_(STATE_INITIAL),
      clock_(clock),
      shared_(s),
      iterations_(0),
      start_cpu_(0.0),
      start_time_(0.0),
      stop_time_micros_(0.0),
      start_pause_cpu_(0.0),
      pause_cpu_time_(0.0),
      start_pause_real_(0.0),
      pause_real_time_(0.0),
      total_iterations_(0),
      // Split the requested minimum running time evenly across the
      // configured number of repetitions.
      interval_micros_(static_cast<int64_t>(kNumMicrosPerSecond *
                                            FLAGS_benchmark_min_time /
                                            FLAGS_benchmark_repetitions)),
      is_continuation_(false),
      stats_(new ThreadStats()) {
  CHECK(clock != nullptr);
  CHECK(s != nullptr);
}
bool State::KeepRunning() { // Base decisions off of real time if requested by this benchmark.
// Fast path double seconds = cpu_accumulated_time;
if ((FLAGS_benchmark_iterations == 0 && std::string label;
!clock_->HasReached(stop_time_micros_ + {
kNumMicrosPerSecond * pause_real_time_)) || MutexLock l(GetBenchmarkLock());
iterations_ < FLAGS_benchmark_iterations) { label = *GetReportLabel();
++iterations_; if (use_real_time) {
return true; seconds = real_accumulated_time;
} }
}
// To block thread 0 until all other threads exit, we have a signal exit // If this was the first run, was elapsed time or cpu time large enough?
// point for KeepRunning() to return false. The fast path above always // If this is not the first run, go with the current value of iter.
// returns true. if ((i > 0) ||
bool ret = false; (iters == FLAGS_benchmark_iterations) ||
switch (state_) { (iters >= kMaxIterations) ||
case STATE_INITIAL: (seconds >= FLAGS_benchmark_min_time) ||
ret = StartRunning(); (real_accumulated_time >= 5*FLAGS_benchmark_min_time)) {
break; double bytes_per_second = 0;
case STATE_STARTING: if (total.bytes_processed > 0 && seconds != 0.0) {
CHECK(false); bytes_per_second = (total.bytes_processed / seconds);
ret = true; }
break; double items_per_second = 0;
case STATE_RUNNING: if (total.items_processed > 0 && seconds != 0.0) {
ret = FinishInterval(); items_per_second = (total.items_processed / seconds);
break; }
case STATE_STOPPING:
ret = MaybeStop();
break;
case STATE_STOPPED:
CHECK(false);
ret = true;
break;
}
if (!ret && shared_->threads > 1 && thread_index == 0){ // Create report about this benchmark run.
std::unique_lock<std::mutex> l(shared_->mu); BenchmarkReporter::Run report;
report.benchmark_name = b.name;
report.report_label = label;
// Report the total iterations across all threads.
report.iterations = static_cast<int64_t>(iters) * b.threads;
report.real_accumulated_time = real_accumulated_time;
report.cpu_accumulated_time = cpu_accumulated_time;
report.bytes_per_second = bytes_per_second;
report.items_per_second = items_per_second;
reports.push_back(report);
break;
}
// Block until all other threads have exited. We can then safely cleanup // See how much iterations should be increased by
// without other threads continuing to access shared variables inside the // Note: Avoid division by zero with max(seconds, 1ns).
// user-provided run function. double multiplier = FLAGS_benchmark_min_time * 1.4 / std::max(seconds, 1e-9);
while (shared_->exited < shared_->threads - 1) { // If our last run was at least 10% of FLAGS_benchmark_min_time then we
shared_->cond.wait(l); // use the multiplier directly. Otherwise we use at most 10 times
// expansion.
// NOTE: When the last run was at least 10% of the min time the max
// expansion should be 14x.
bool is_significant = (seconds / FLAGS_benchmark_min_time) > 0.1;
multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
if (multiplier <= 1.0) multiplier = 2.0;
double next_iters = std::max(multiplier * iters, iters + 1.0);
if (next_iters > kMaxIterations) {
next_iters = kMaxIterations;
}
VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
iters = static_cast<int>(next_iters + 0.5);
} }
} }
br->ReportRuns(reports);
if (ret) { if (b.multithreaded) {
++iterations_; for (std::thread& thread : pool)
thread.join();
} }
return ret; }
} // namespace
// Construct the fixed-iteration State used by the new timing scheme:
// the thread runs exactly `max_iters` iterations of the benchmark body.
State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
             int thread_i)
    : started_(false), total_iterations_(0),
      has_range_x_(has_x), range_x_(x),
      has_range_y_(has_y), range_y_(y),
      bytes_processed_(0), items_processed_(0),
      thread_index(thread_i),
      max_iterations(max_iters)
{
  // A zero iteration count would make KeepRunning() return false
  // immediately and yield a meaningless (empty) measurement.
  CHECK(max_iterations != 0) << "At least one iteration must be run";
}
void State::PauseTiming() { void State::PauseTiming() {
start_pause_cpu_ = MyCPUUsage() + ChildrenCPUUsage(); // Add in time accumulated so far
start_pause_real_ = walltime::Now(); CHECK(running_benchmark);
timer_manager->StopTimer();
} }
void State::ResumeTiming() { void State::ResumeTiming() {
pause_cpu_time_ += MyCPUUsage() + ChildrenCPUUsage() - start_pause_cpu_; CHECK(running_benchmark);
pause_real_time_ += walltime::Now() - start_pause_real_; timer_manager->StartTimer();
} }
void State::SetBytesProcessed(int64_t bytes) { void State::UseRealTime() {
CHECK_EQ(STATE_STOPPED, state_); MutexLock l(GetBenchmarkLock());
std::lock_guard<std::mutex> l(shared_->mu); use_real_time = true;
stats_->bytes_processed = bytes;
} }
void State::SetItemsProcessed(int64_t items) { void State::SetLabel(const char* label) {
CHECK_EQ(STATE_STOPPED, state_); CHECK(running_benchmark);
std::lock_guard<std::mutex> l(shared_->mu); MutexLock l(GetBenchmarkLock());
stats_->items_processed = items; *GetReportLabel() = label;
} }
void State::SetLabel(const std::string& label) { BenchmarkReporter::~BenchmarkReporter() {}
CHECK_EQ(STATE_STOPPED, state_);
std::lock_guard<std::mutex> l(shared_->mu);
shared_->label = label;
}
int State::range_x() const { namespace internal {
CHECK(shared_->instance->rangeXset);
/*
<<
"Failed to get range_x as it was not set. Did you register your "
"benchmark with a range parameter?";
*/
return shared_->instance->rangeX;
}
int State::range_y() const { bool ConsoleReporter::ReportContext(const Context& context) const {
CHECK(shared_->instance->rangeYset); name_field_width_ = context.name_field_width;
/* <<
"Failed to get range_y as it was not set. Did you register your "
"benchmark with a range parameter?";
*/
return shared_->instance->rangeY;
}
bool State::StartRunning() { fprintf(stdout,
bool last_thread = false; "Run on (%d X %0.0f MHz CPU%s)\n",
{ context.num_cpus,
std::lock_guard<std::mutex> l(shared_->mu); context.mhz_per_cpu,
CHECK_EQ(state_, STATE_INITIAL); (context.num_cpus > 1) ? "s" : "");
state_ = STATE_STARTING;
is_continuation_ = false; int remainder_us;
CHECK_LT(shared_->starting, shared_->threads); std::string walltime_str = walltime::Print(
++shared_->starting; walltime::Now(), "%Y/%m/%d-%H:%M:%S",
last_thread = shared_->starting == shared_->threads; true, // use local timezone
} &remainder_us);
fprintf(stdout, "%s\n", walltime_str.c_str());
if (last_thread) { if (context.cpu_scaling_enabled) {
clock_->InitType(use_real_time ? FastClock::REAL_TIME fprintf(stdout, "***WARNING*** CPU scaling is enabled, the benchmark "
: FastClock::CPU_TIME); "timings may be noisy\n");
{
std::lock_guard<std::mutex> l(starting_mutex);
starting_cv.notify_all();
}
} else {
std::unique_lock<std::mutex> l(starting_mutex);
starting_cv.wait(l);
} }
CHECK_EQ(state_, STATE_STARTING);
state_ = STATE_RUNNING;
NewInterval(); #ifndef NDEBUG
fprintf(stdout, "Build Type: DEBUG\n");
#endif
int output_width =
fprintf(stdout,
"%-*s %10s %10s %10s\n",
static_cast<int>(name_field_width_),
"Benchmark",
"Time(ns)", "CPU(ns)",
"Iterations");
fprintf(stdout, "%s\n", std::string(output_width - 1, '-').c_str());
return true; return true;
} }
void State::NewInterval() { void ConsoleReporter::ReportRuns(
stop_time_micros_ = clock_->NowMicros() + interval_micros_; const std::vector<Run>& reports) const {
if (!is_continuation_) { if (reports.empty()) {
VLOG(1) << "Starting new interval; stopping in " << interval_micros_ return;
<< "\n";
iterations_ = 0;
pause_cpu_time_ = 0;
pause_real_time_ = 0;
start_cpu_ = MyCPUUsage() + ChildrenCPUUsage();
start_time_ = walltime::Now();
} else {
VLOG(1) << "Continuing interval; stopping in " << interval_micros_
<< "\n";
} }
}
bool State::FinishInterval() { for (Run const& run : reports) {
if ((FLAGS_benchmark_iterations != 0 && CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
iterations_ < PrintRunData(run);
FLAGS_benchmark_iterations / FLAGS_benchmark_repetitions) ||
iterations_ < 1) {
interval_micros_ *= 2;
VLOG(1) << "Not enough iterations in interval; "
<< "Trying again for " << interval_micros_ << " useconds.\n";
is_continuation_ = false;
NewInterval();
return true;
} }
BenchmarkReporter::Run data; if (reports.size() < 2) {
data.iterations = iterations_; // We don't report aggregated data if there was a single run.
data.thread_index = thread_index; return;
const double accumulated_time = walltime::Now() - start_time_;
const double total_overhead = overhead * iterations_;
CHECK_LT(pause_real_time_, accumulated_time);
CHECK_LT(pause_real_time_ + total_overhead, accumulated_time);
data.real_accumulated_time =
accumulated_time - (pause_real_time_ + total_overhead);
data.cpu_accumulated_time = (MyCPUUsage() + ChildrenCPUUsage()) -
(pause_cpu_time_ + start_cpu_);
total_iterations_ += iterations_;
bool keep_going = false;
{
std::lock_guard<std::mutex> l(shared_->mu);
// Either replace the last or add a new data point.
if (is_continuation_)
shared_->runs.back() = data;
else
shared_->runs.push_back(data);
if (FLAGS_benchmark_iterations != 0) {
// If we need more iterations, run another interval as a continuation.
keep_going = total_iterations_ < FLAGS_benchmark_iterations;
is_continuation_ = keep_going;
} else {
// If this is a repetition, run another interval as a new data point.
keep_going = shared_->runs.size() <
static_cast<size_t>(FLAGS_benchmark_repetitions);
is_continuation_ = !keep_going;
}
if (!keep_going) {
++shared_->stopping;
if (shared_->stopping < shared_->threads) {
// Other threads are still running, so continue running but without
// timing to present an expected background load to the other threads.
state_ = STATE_STOPPING;
keep_going = true;
} else {
state_ = STATE_STOPPED;
}
}
} }
if (state_ == STATE_RUNNING) NewInterval(); Run mean_data;
return keep_going; Run stddev_data;
} ComputeStats(reports, &mean_data, &stddev_data);
bool State::MaybeStop() { // Output using PrintRun.
std::lock_guard<std::mutex> l(shared_->mu); PrintRunData(mean_data);
if (shared_->stopping < shared_->threads) { PrintRunData(stddev_data);
CHECK_EQ(state_, STATE_STOPPING); fprintf(stdout, "\n");
return true;
}
state_ = STATE_STOPPED;
return false;
} }
void State::Run() { void ConsoleReporter::PrintRunData(const Run& result) const {
stats_->Reset(); // Format bytes per second
shared_->instance->bm->function_(*this); std::string rate;
{ if (result.bytes_per_second > 0) {
std::lock_guard<std::mutex> l(shared_->mu); rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
shared_->stats.Add(*stats_);
} }
}
void State::RunAsThread() { // Format items per second
thread_ = std::thread(State::RunWrapper, this); std::string items;
} if (result.items_per_second > 0) {
items = StrCat(" ", HumanReadableNumber(result.items_per_second),
void State::Wait() { " items/s");
if (thread_.joinable()) {
thread_.join();
} }
}
// static
void* State::RunWrapper(void* arg) {
State* that = (State*)arg;
CHECK(that != nullptr);
that->Run();
std::lock_guard<std::mutex> l(that->shared_->mu);
that->shared_->exited++; double const multiplier = 1e9; // nano second multiplier
if (that->thread_index > 0 && ColorPrintf(COLOR_GREEN, "%-*s ",
that->shared_->exited == that->shared_->threads - 1) { name_field_width_, result.benchmark_name.c_str());
// All threads but thread 0 have exited the user-provided run function. if (result.iterations == 0) {
// Thread 0 can now wake up and exit. ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
that->shared_->cond.notify_one(); result.real_accumulated_time * multiplier,
result.cpu_accumulated_time * multiplier);
} else {
ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
(result.real_accumulated_time * multiplier) /
(static_cast<double>(result.iterations)),
(result.cpu_accumulated_time * multiplier) /
(static_cast<double>(result.iterations)));
} }
ColorPrintf(COLOR_CYAN, "%10lld", result.iterations);
return nullptr; ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n",
13, rate.c_str(),
18, items.c_str(),
result.report_label.c_str());
} }
namespace internal {
void RunMatchingBenchmarks(const std::string& spec, void RunMatchingBenchmarks(const std::string& spec,
const BenchmarkReporter* reporter) { const BenchmarkReporter* reporter) {
CHECK(reporter != nullptr);
if (spec.empty()) return; if (spec.empty()) return;
std::vector<internal::Benchmark::Instance> benchmarks; std::vector<benchmark::internal::Benchmark::Instance> benchmarks;
BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &benchmarks); auto families = benchmark::internal::BenchmarkFamilies::GetInstance();
if (!families->FindBenchmarks(spec, &benchmarks)) return;
// Determine the width of the name field using a minimum width of 10. // Determine the width of the name field using a minimum width of 10.
// Also determine max number of threads needed. // Also determine max number of threads needed.
...@@ -1144,45 +885,78 @@ void RunMatchingBenchmarks(const std::string& spec, ...@@ -1144,45 +885,78 @@ void RunMatchingBenchmarks(const std::string& spec,
BenchmarkReporter::Context context; BenchmarkReporter::Context context;
context.num_cpus = NumCPUs(); context.num_cpus = NumCPUs();
context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f; context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;
// context.cpu_info = base::CompactCPUIDInfoString();
context.cpu_scaling_enabled = CpuScalingEnabled(); context.cpu_scaling_enabled = CpuScalingEnabled();
context.name_field_width = name_field_width; context.name_field_width = name_field_width;
if (reporter->ReportContext(context)) if (reporter->ReportContext(context)) {
for (internal::Benchmark::Instance& benchmark : benchmarks) for (const auto& benchmark : benchmarks) {
Benchmark::RunInstance(benchmark, reporter); RunBenchmark(benchmark, reporter);
}
}
} }
void FindMatchingBenchmarkNames(const std::string& spec, } // end namespace internal
std::vector<std::string>* benchmark_names) {
if (spec.empty()) return;
std::vector<internal::Benchmark::Instance> benchmarks;
BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &benchmarks);
std::transform(benchmarks.begin(), benchmarks.end(), benchmark_names->begin(),
[](const internal::Benchmark::Instance& b) { return b.name; });
}
} // end namespace internal
void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter /*= nullptr*/) { void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter) {
std::string spec = FLAGS_benchmark_filter; std::string spec = FLAGS_benchmark_filter;
if (spec.empty() || spec == "all") if (spec.empty() || spec == "all")
spec = "."; // Regexp that matches all benchmarks spec = "."; // Regexp that matches all benchmarks
internal::ConsoleReporter default_reporter; internal::ConsoleReporter default_reporter;
internal::RunMatchingBenchmarks( internal::RunMatchingBenchmarks(spec, reporter ? reporter : &default_reporter);
spec, reporter == nullptr ? &default_reporter : reporter); }
namespace internal {
// Print the set of command-line flags recognized by the benchmark library
// and terminate the process with a success exit code.  Invoked when
// "--help" is found on the command line; this function does not return.
void PrintUsageAndExit() {
  static const char kUsage[] =
      "benchmark"
      " [--benchmark_filter=<regex>]\n"
      " [--benchmark_iterations=<iterations>]\n"
      " [--benchmark_min_time=<min_time>]\n"
      " [--benchmark_repetitions=<num_repetitions>]\n"
      " [--color_print={true|false}]\n"
      " [--v=<verbosity>]\n";
  // The usage text contains no format specifiers, so fputs is equivalent
  // to the fprintf(stdout, ...) form and avoids format parsing.
  fputs(kUsage, stdout);
  exit(0);
}
// Scans argv for flags owned by the benchmark library, stores each
// recognized value into its FLAGS_* global, and removes the consumed
// argument from argv so the remaining program never sees it.
// "--help" prints usage and exits the process.
void ParseCommandLineFlags(int* argc, const char** argv) {
  using namespace benchmark;
  for (int i = 1; i < *argc; ++i) {
    if (
        ParseStringFlag(argv[i], "benchmark_filter",
                        &FLAGS_benchmark_filter) ||
        ParseInt32Flag(argv[i], "benchmark_iterations",
                       &FLAGS_benchmark_iterations) ||
        ParseDoubleFlag(argv[i], "benchmark_min_time",
                        &FLAGS_benchmark_min_time) ||
        ParseInt32Flag(argv[i], "benchmark_repetitions",
                       &FLAGS_benchmark_repetitions) ||
        ParseBoolFlag(argv[i], "color_print",
                      &FLAGS_color_print) ||
        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
      // Flag consumed: shift the remaining pointers (including the
      // terminating argv[*argc] entry) left by one, shrink the count,
      // and re-examine slot i, which now holds the next argument.
      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
      --(*argc);
      --i;
    } else if (IsFlag(argv[i], "help")) {
      PrintUsageAndExit();  // does not return
    }
  }
}
void UseRealTime() { use_real_time = true; } } // end namespace internal
void Initialize(int* argc, const char** argv) { void Initialize(int* argc, const char** argv) {
internal::ParseCommandLineFlags(argc, argv); internal::ParseCommandLineFlags(argc, argv);
internal::SetLogLevel(FLAGS_v); internal::SetLogLevel(FLAGS_v);
// Ensure walltime is initialized by a single thread by forcing the // TODO remove this. It prints some output the first time it is called.
// initialization. // We don't want to have this ouput printed during benchmarking.
MyCPUUsage();
// The first call to walltime::Now initialized it. Call it once to
// prevent the initialization from happening in a benchmark.
walltime::Now(); walltime::Now();
internal::Benchmark::MeasureOverhead();
} }
} // end namespace benchmark } // end namespace benchmark
#ifndef BENCHMARK_MUTEX_H_
#define BENCHMARK_MUTEX_H_

#include <mutex>
#include <condition_variable>

// Enable thread safety attributes only with clang.
// The attributes can be safely erased when compiling with other compilers.
#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
#else
#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op
#endif

// The macros below mirror the attribute names documented by Clang's
// thread safety analysis (-Wthread-safety).  Each expands to the
// corresponding __attribute__ under clang and to nothing elsewhere.

// Type attributes: mark a class as a lockable capability / scoped lock.
#define CAPABILITY(x) \
  THREAD_ANNOTATION_ATTRIBUTE__(capability(x))

#define SCOPED_CAPABILITY \
  THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)

// Data attributes: declare which mutex protects a member (directly or
// through a pointer).
#define GUARDED_BY(x) \
  THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))

#define PT_GUARDED_BY(x) \
  THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))

// Lock-ordering attributes, used to detect potential deadlocks.
#define ACQUIRED_BEFORE(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))

#define ACQUIRED_AFTER(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))

// Function attributes: capabilities that must be held on entry.
#define REQUIRES(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))

#define REQUIRES_SHARED(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))

// Function attributes: capabilities acquired/released by the function.
#define ACQUIRE(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))

#define ACQUIRE_SHARED(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))

#define RELEASE(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))

#define RELEASE_SHARED(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))

#define TRY_ACQUIRE(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))

#define TRY_ACQUIRE_SHARED(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))

// Capabilities that must NOT be held when the function is called.
#define EXCLUDES(...) \
  THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))

// Runtime assertions that a capability is (shared-)held.
#define ASSERT_CAPABILITY(x) \
  THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))

#define ASSERT_SHARED_CAPABILITY(x) \
  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))

// Declares that a function returns a reference to the given capability.
#define RETURN_CAPABILITY(x) \
  THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))

// Opt a function out of the analysis entirely.
#define NO_THREAD_SAFETY_ANALYSIS \
  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)

namespace benchmark {

// Alias so callers name the library's condition type rather than std's.
typedef std::condition_variable Condition;

// NOTE: Wrappers for std::mutex and std::unique_lock are provided so that
// we can annotate them with thread safety attributes and use the
// -Wthread-safety warning with clang. The standard library types cannot be
// used directly because they do not provide the required annotations.
// Thin wrapper around std::mutex annotated as a "mutex" capability for
// Clang's thread safety analysis.  Behaves exactly like std::mutex.
class CAPABILITY("mutex") Mutex {
 public:
  Mutex() {}

  // Acquires exclusive ownership of the underlying mutex.
  void lock() ACQUIRE() { mut_.lock(); }

  // Releases ownership previously obtained via lock().
  void unlock() RELEASE() { mut_.unlock(); }

  // Exposes the wrapped std::mutex, e.g. for use with std::unique_lock
  // or condition variables.
  std::mutex& native_handle() { return mut_; }

 private:
  std::mutex mut_;
};
// RAII scoped lock over a Mutex, annotated as a scoped capability so the
// analysis knows the mutex is held for the lifetime of this object.
// Backed by std::unique_lock so it can be passed to Condition::wait.
class SCOPED_CAPABILITY MutexLock
{
  typedef std::unique_lock<std::mutex> MutexLockImp;
public:
  // Acquires m on construction; m must outlive this lock object.
  MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle())
  { }
  // The underlying unique_lock releases the mutex; the RELEASE()
  // annotation informs the analysis even though the body is empty.
  ~MutexLock() RELEASE() {}
  // Exposes the underlying unique_lock (e.g. for condition-variable waits).
  MutexLockImp& native_handle() { return ml_; }
private:
  MutexLockImp ml_;
};
// One-shot, level-triggered event: threads block in WaitForNotification()
// until some thread calls Notify().  Once notified, all current and future
// waiters return immediately.  Notify() may be called more than once.
class Notification
{
public:
  Notification() : notified_yet_(false) { }

  // Blocks until Notify() has been called.  Uses the predicate overload of
  // condition_variable::wait, so spurious wakeups are handled.
  void WaitForNotification() const EXCLUDES(mutex_) {
    MutexLock m_lock(mutex_);
    auto notified_fn = [this]() REQUIRES(mutex_) {
      return this->HasBeenNotified();
    };
    cv_.wait(m_lock.native_handle(), notified_fn);
  }

  // Marks the event as notified and wakes all waiters.  The flag is set
  // under the mutex; notify_all is issued after releasing it.
  void Notify() EXCLUDES(mutex_) {
    {
      MutexLock lock(mutex_);
      notified_yet_ = true;
    }
    cv_.notify_all();
  }

private:
  // Requires mutex_ to be held by the caller.
  bool HasBeenNotified() const REQUIRES(mutex_) {
    return notified_yet_;
  }

  // mutable so the const WaitForNotification() can lock and wait.
  mutable Mutex mutex_;
  mutable std::condition_variable cv_;
  bool notified_yet_ GUARDED_BY(mutex_);
};
} // end namespace benchmark
#endif // BENCHMARK_MUTEX_H_
...@@ -24,13 +24,13 @@ static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), ...@@ -24,13 +24,13 @@ static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
"Small SI and Big SI unit arrays must be the same size"); "Small SI and Big SI unit arrays must be the same size");
static const int kUnitsSize = arraysize(kBigSIUnits); static const int64_t kUnitsSize = arraysize(kBigSIUnits);
} // end anonymous namespace } // end anonymous namespace
void ToExponentAndMantissa(double val, double thresh, int precision, void ToExponentAndMantissa(double val, double thresh, int precision,
double one_k, std::string* mantissa, double one_k, std::string* mantissa,
int* exponent) { int64_t* exponent) {
std::stringstream mantissa_stream; std::stringstream mantissa_stream;
if (val < 0) { if (val < 0) {
...@@ -80,10 +80,10 @@ void ToExponentAndMantissa(double val, double thresh, int precision, ...@@ -80,10 +80,10 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
*mantissa = mantissa_stream.str(); *mantissa = mantissa_stream.str();
} }
std::string ExponentToPrefix(int exponent, bool iec) { std::string ExponentToPrefix(int64_t exponent, bool iec) {
if (exponent == 0) return ""; if (exponent == 0) return "";
const int index = (exponent > 0 ? exponent - 1 : -exponent - 1); const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
if (index >= kUnitsSize) return ""; if (index >= kUnitsSize) return "";
const char* array = const char* array =
...@@ -97,7 +97,7 @@ std::string ExponentToPrefix(int exponent, bool iec) { ...@@ -97,7 +97,7 @@ std::string ExponentToPrefix(int exponent, bool iec) {
std::string ToBinaryStringFullySpecified(double value, double threshold, std::string ToBinaryStringFullySpecified(double value, double threshold,
int precision) { int precision) {
std::string mantissa; std::string mantissa;
int exponent; int64_t exponent;
ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa, ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa,
&exponent); &exponent);
return mantissa + ExponentToPrefix(exponent, false); return mantissa + ExponentToPrefix(exponent, false);
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include "check.h" #include "check.h"
#include "cycleclock.h" #include "cycleclock.h"
#include "internal_macros.h" #include "internal_macros.h"
#include "log.h"
#include "sleep.h" #include "sleep.h"
namespace benchmark { namespace benchmark {
...@@ -322,7 +323,7 @@ double MyCPUUsage() { ...@@ -322,7 +323,7 @@ double MyCPUUsage() {
return value; return value;
} }
// Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage(). // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
std::cout << "Reading /proc/self/cputime_ns failed. Using getrusage().\n"; VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
use_cputime_ns = false; use_cputime_ns = false;
} }
} }
......
...@@ -20,3 +20,6 @@ add_test(filter_regex_none filter_test --benchmark_filter=monkey 0) ...@@ -20,3 +20,6 @@ add_test(filter_regex_none filter_test --benchmark_filter=monkey 0)
add_test(filter_regex_wildcard filter_test --benchmark_filter=.*Calculate.* 16) add_test(filter_regex_wildcard filter_test --benchmark_filter=.*Calculate.* 16)
add_test(filter_regex_begin filter_test --benchmark_filter=^BM_Calculate.* 16) add_test(filter_regex_begin filter_test --benchmark_filter=^BM_Calculate.* 16)
add_test(filter_regex_end filter_test --benchmark_filter=.*Pi$ 8) add_test(filter_regex_end filter_test --benchmark_filter=.*Pi$ 8)
compile_benchmark_test(basic_test)
add_test(basic basic_test)
#include <cstddef>
#include "benchmark/benchmark.h"
// Registers benchmark |x| with three argument sizes (8, 512, 8192) so each
// test body is exercised across small, medium and large ranges.
#define BASIC_BENCHMARK_TEST(x) \
    BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
// Minimal benchmark: each iteration only reads the current iteration
// count into a volatile local so the loop body cannot be optimized away.
void BM_empty(benchmark::State& state) {
  while (state.KeepRunning()) {
    volatile std::size_t iteration_count = state.iterations();
    static_cast<void>(iteration_count);
  }
}
BENCHMARK(BM_empty);
BENCHMARK(BM_empty)->ThreadPerCpu();
// Spins for range_x() steps per timed iteration; the volatile store keeps
// the compiler from eliminating the busy loop.
void BM_spin_empty(benchmark::State& state) {
  while (state.KeepRunning()) {
    for (int step = 0; step < state.range_x(); ++step) {
      volatile int sink = step;
      static_cast<void>(sink);
    }
  }
}
BASIC_BENCHMARK_TEST(BM_spin_empty);
BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
// Performs untimed spin work BEFORE the KeepRunning() loop, then the same
// work inside it, to verify setup work outside the loop is not measured.
void BM_spin_pause_before(benchmark::State& state) {
  // Untimed warm-up spin.
  for (int step = 0; step < state.range_x(); ++step) {
    volatile int sink = step;
    static_cast<void>(sink);
  }
  // Timed spin.
  while (state.KeepRunning()) {
    for (int step = 0; step < state.range_x(); ++step) {
      volatile int sink = step;
      static_cast<void>(sink);
    }
  }
}
BASIC_BENCHMARK_TEST(BM_spin_pause_before);
BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
// Pauses the timer, spins, resumes, then spins again inside each timed
// iteration — exercises PauseTiming()/ResumeTiming() inside the loop.
void BM_spin_pause_during(benchmark::State& state) {
  while (state.KeepRunning()) {
    state.PauseTiming();
    // This spin happens while the timer is stopped.
    for (int step = 0; step < state.range_x(); ++step) {
      volatile int sink = step;
      static_cast<void>(sink);
    }
    state.ResumeTiming();
    // This spin is measured.
    for (int step = 0; step < state.range_x(); ++step) {
      volatile int sink = step;
      static_cast<void>(sink);
    }
  }
}
BASIC_BENCHMARK_TEST(BM_spin_pause_during);
BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
// Spins inside the KeepRunning() loop, then performs the same untimed work
// AFTER the loop, to verify teardown work is not measured.
void BM_spin_pause_after(benchmark::State& state) {
  // Timed spin.
  while (state.KeepRunning()) {
    for (int step = 0; step < state.range_x(); ++step) {
      volatile int sink = step;
      static_cast<void>(sink);
    }
  }
  // Untimed cool-down spin.
  for (int step = 0; step < state.range_x(); ++step) {
    volatile int sink = step;
    static_cast<void>(sink);
  }
}
BASIC_BENCHMARK_TEST(BM_spin_pause_after);
BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
// Combines the previous two cases: untimed spins both before and after the
// timed KeepRunning() loop.
void BM_spin_pause_before_and_after(benchmark::State& state) {
  // Untimed warm-up spin.
  for (int step = 0; step < state.range_x(); ++step) {
    volatile int sink = step;
    static_cast<void>(sink);
  }
  // Timed spin.
  while (state.KeepRunning()) {
    for (int step = 0; step < state.range_x(); ++step) {
      volatile int sink = step;
      static_cast<void>(sink);
    }
  }
  // Untimed cool-down spin.
  for (int step = 0; step < state.range_x(); ++step) {
    volatile int sink = step;
    static_cast<void>(sink);
  }
}
BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
// Benchmarks the bare cost of the KeepRunning() start/stop machinery:
// the loop body is intentionally empty.
void BM_empty_stop_start(benchmark::State& state) {
  for (; state.KeepRunning(); ) {
  }
}
BENCHMARK(BM_empty_stop_start);
BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
BENCHMARK_MAIN()
...@@ -53,18 +53,22 @@ static void BM_Factorial(benchmark::State& state) { ...@@ -53,18 +53,22 @@ static void BM_Factorial(benchmark::State& state) {
while (state.KeepRunning()) while (state.KeepRunning())
fac_42 = Factorial(8); fac_42 = Factorial(8);
// Prevent compiler optimizations // Prevent compiler optimizations
std::cout << fac_42; std::stringstream ss;
ss << fac_42;
state.SetLabel(ss.str());
} }
BENCHMARK(BM_Factorial); BENCHMARK(BM_Factorial);
static void BM_FactorialRealTime(benchmark::State& state) { static void BM_FactorialRealTime(benchmark::State& state) {
benchmark::UseRealTime(); state.UseRealTime();
int fac_42 = 0; int fac_42 = 0;
while (state.KeepRunning()) while (state.KeepRunning())
fac_42 = Factorial(8); fac_42 = Factorial(8);
// Prevent compiler optimizations // Prevent compiler optimizations
std::cout << fac_42; std::stringstream ss;
ss << fac_42;
state.SetLabel(ss.str());
} }
BENCHMARK(BM_FactorialRealTime); BENCHMARK(BM_FactorialRealTime);
...@@ -158,12 +162,5 @@ static void BM_LongTest(benchmark::State& state) { ...@@ -158,12 +162,5 @@ static void BM_LongTest(benchmark::State& state) {
} }
BENCHMARK(BM_LongTest)->Range(1<<16,1<<28); BENCHMARK(BM_LongTest)->Range(1<<16,1<<28);
int main(int argc, const char* argv[]) { BENCHMARK_MAIN()
benchmark::Initialize(&argc, argv);
assert(Factorial(8) == 40320);
assert(CalculatePi(1) == 0.0);
benchmark::RunSpecifiedBenchmarks();
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment