Commit 403f3544 by Dominic Hamon

Initial commit

Benchmark library builds and runs but only single-threaded. Multithreaded support needs a bit more love. Currently requires some C++11 support (g++ 4.6.3 seems to work).
parents
CMakeCache.txt
CMakeFiles/
Makefile
bin/
cmake_install.cmake
lib/
cmake_minimum_required (VERSION 2.8)
project (benchmark)
find_package(Threads)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
set(CMAKE_CXX_FLAGS "-Wall -Werror --std=c++0x")
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG")
set(CMAKE_CXX_FLAGS_RELEASE "-fno-strict-aliasing -O3 -DNDEBUG")
# Set OS
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
add_definitions(-DOS_MACOSX)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
add_definitions(-DOS_LINUX)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
add_definitions(-DOS_WINDOWS)
endif()
# Set CPU
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86")
add_definitions(-DARCH_X86)
endif()
# Set up directories
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/src)
link_directories(${PROJECT_SOURCE_DIR}/lib)
# Build the targets
FILE(GLOB SOURCE_FILES "src/*.cc")
add_library(benchmark STATIC ${SOURCE_FILES})
add_executable(benchmark_test test/benchmark_test.cc)
target_link_libraries(benchmark_test benchmark ${CMAKE_THREAD_LIBS_INIT})
// Support for registering benchmarks for functions.
/* Example usage:
// Define a function that executes the code to be measured a
// specified number of times:
static void BM_StringCreation(benchmark::State& state) {
while (state.KeepRunning())
std::string empty_string;
}
// Register the function as a benchmark
BENCHMARK(BM_StringCreation);
// Define another benchmark
static void BM_StringCopy(benchmark::State& state) {
std::string x = "hello";
while (state.KeepRunning())
std::string copy(x);
}
BENCHMARK(BM_StringCopy);
// Augment the main() program to invoke benchmarks if specified
// via the --benchmarks command line flag. E.g.,
// my_unittest --benchmarks=all
// my_unittest --benchmarks=BM_StringCreation
// my_unittest --benchmarks=String
// my_unittest --benchmarks='Copy|Creation'
int main(int argc, char** argv) {
Initialize(&argc, argv);
RunSpecifiedBenchmarks();
}
// Sometimes a family of microbenchmarks can be implemented with
// just one routine that takes an extra argument to specify which
// one of the family of benchmarks to run. For example, the following
// code defines a family of microbenchmarks for measuring the speed
// of memcpy() calls of different lengths:
static void BM_memcpy(benchmark::State& state) {
char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
memset(src, 'x', state.range_x());
while (state.KeepRunning()) {
memcpy(dst, src, state.range_x());
SetBenchmarkBytesProcessed(int64_t_t(state.iterations) * int64(state.range_x()));
delete[] src; delete[] dst;
}
BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
// The preceding code is quite repetitive, and can be replaced with the
// following short-hand. The following invocation will pick a few
// appropriate arguments in the specified range and will generate a
// microbenchmark for each such argument.
BENCHMARK(BM_memcpy)->Range(8, 8<<10);
// You might have a microbenchmark that depends on two inputs. For
// example, the following code defines a family of microbenchmarks for
// measuring the speed of set insertion.
static void BM_SetInsert(benchmark::State& state) {
while (state.KeepRunning()) {
state.PauseTiming();
set<int> data = ConstructRandomSet(state.range_x());
state.ResumeTiming();
for (int j = 0; j < state.rangeY; ++j)
data.insert(RandomNumber());
}
}
BENCHMARK(BM_SetInsert)
->ArgPair(1<<10, 1)
->ArgPair(1<<10, 8)
->ArgPair(1<<10, 64)
->ArgPair(1<<10, 512)
->ArgPair(8<<10, 1)
->ArgPair(8<<10, 8)
->ArgPair(8<<10, 64)
->ArgPair(8<<10, 512);
// The preceding code is quite repetitive, and can be replaced with
// the following short-hand. The following macro will pick a few
// appropriate arguments in the product of the two specified ranges
// and will generate a microbenchmark for each such pair.
BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
// For more complex patterns of inputs, passing a custom function
// to Apply allows programmatic specification of an
// arbitrary set of arguments to run the microbenchmark on.
// The following example enumerates a dense range on
// one parameter, and a sparse range on the second.
static benchmark::internal::Benchmark* CustomArguments(
benchmark::internal::Benchmark* b) {
for (int i = 0; i <= 10; ++i)
for (int j = 32; j <= 1024*1024; j *= 8)
b = b->ArgPair(i, j);
return b;
}
BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
// Templated microbenchmarks work the same way:
// Produce then consume 'size' messages 'iters' times
// Measures throughput in the absence of multiprogramming.
template <class Q> int BM_Sequential(benchmark::State& state) {
Q q;
typename Q::value_type v;
while (state.KeepRunning()) {
for (int i = state.range_x(); i--; )
q.push(v);
for (int e = state.range_x(); e--; )
q.Wait(&v);
}
// actually messages, not bytes:
state.SetBytesProcessed(
static_cast<int64_t>(state.iterations())*state.range_x());
}
BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
In a multithreaded test, it is guaranteed that none of the threads will start
until all have called KeepRunning, and all will have finished before KeepRunning
returns false. As such, any global setup or teardown you want to do can be
wrapped in a check against the thread index:
static void BM_MultiThreaded(benchmark::State& state) {
if (state.thread_index == 0) {
// Setup code here.
}
while (state.KeepRunning()) {
// Run the test as normal.
}
if (state.thread_index == 0) {
// Teardown code here.
}
}
*/
#ifndef BENCHMARK_BENCHMARK_H_
#define BENCHMARK_BENCHMARK_H_
#include <stdint.h>
#include <functional>
#include <string>
#include <vector>
#include "macros.h"
namespace benchmark {
// If the --benchmarks flag is empty, do nothing.
//
// Otherwise, run all benchmarks specified by the --benchmarks flag,
// and exit after running the benchmarks.
extern void RunSpecifiedBenchmarks();
// ------------------------------------------------------
// Routines that can be called from within a benchmark
//
// REQUIRES: a benchmark is currently executing
extern void SetLabel(const std::string& label);
// If this routine is called, peak memory allocation past this point in the
// benchmark is reported at the end of the benchmark report line. (It is
// computed by running the benchmark once with a single iteration and a memory
// tracer.)
extern void MemoryUsage();
// If a particular benchmark is I/O bound, or if for some reason CPU
// timings are not representative, call this method from within the
// benchmark routine. If called, the elapsed time will be used to
// control how many iterations are run, and in the printing of
// items/second or MB/seconds values. If not called, the cpu time
// used by the benchmark will be used.
extern void UseRealTime();
namespace internal {
class Benchmark;
}
// State is passed to a running Benchmark and contains state for the
// benchmark to use.
class State {
public:
// Returns true iff the benchmark should continue through another iteration.
bool KeepRunning();
void PauseTiming();
void ResumeTiming();
// Set the number of bytes processed by the current benchmark
// execution. This routine is typically called once at the end of a
// throughput oriented benchmark. If this routine is called with a
// value > 0, the report is printed in MB/sec instead of nanoseconds
// per iteration.
//
// REQUIRES: a benchmark has exited its KeepRunning loop.
void SetBytesProcessed(int64_t bytes);
// If this routine is called with items > 0, then an items/s
// label is printed on the benchmark report line for the currently
// executing benchmark. It is typically called at the end of a processing
// benchmark where a processing items/second output is desired.
//
// REQUIRES: a benchmark has exited its KeepRunning loop.
void SetItemsProcessed(int64_t items);
// If this routine is called, the specified label is printed at the
// end of the benchmark report line for the currently executing
// benchmark. Example:
// static void BM_Compress(int iters) {
// ...
// double compress = input_size / output_size;
// benchmark::SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
// }
// Produces output that looks like:
// BM_Compress 50 50 14115038 compress:27.3%
//
// REQUIRES: a benchmark has exited its KeepRunning loop.
void SetLabel(const std::string& label);
// Range arguments for this run. CHECKs if the argument has been set.
int range_x() const;
int range_y() const;
int iterations() const { return total_iterations_; }
const int thread_index;
private:
class FastClock;
struct SharedState;
State(FastClock* clock, SharedState* s, int t);
bool StartRunning();
bool FinishInterval();
bool MaybeStop();
void NewInterval();
bool AllStarting();
bool RunAnotherInterval() const;
void Run();
enum EState {
STATE_INITIAL, // KeepRunning hasn't been called
STATE_STARTING, // KeepRunning called, waiting for other threads
STATE_RUNNING, // Running and being timed
STATE_STOPPING, // Not being timed but waiting for other threads
STATE_STOPPED, // Stopped
} state_;
FastClock* clock_;
// State shared by all BenchmarkRun objects that belong to the same
// BenchmarkInstance
SharedState* shared_;
// Custom label set by the user.
std::string label_;
// Each State object goes through a sequence of measurement intervals. By
// default each interval is approx. 100ms in length. The following stats are
// kept for each interval.
int64_t iterations_;
double start_cpu_;
double start_time_;
int64_t stop_time_micros_;
double start_pause_;
double pause_time_;
// Total number of iterations for all finished runs.
int64_t total_iterations_;
// Approximate time in microseconds for one interval of execution.
// Dynamically adjusted as needed.
int64_t interval_micros_;
// True if the current interval is the continuation of a previous one.
bool is_continuation_;
friend class internal::Benchmark;
DISALLOW_COPY_AND_ASSIGN(State);
};
namespace internal {
class BenchmarkReporter;
typedef std::function<void(State&)> BenchmarkFunction;
// Run all benchmarks whose name is a partial match for the regular
// expression in "spec". The results of benchmark runs are fed to "reporter".
void RunMatchingBenchmarks(const std::string& spec,
BenchmarkReporter* reporter);
// Extract the list of benchmark names that match the specified regular
// expression.
void FindMatchingBenchmarkNames(const std::string& re,
std::vector<std::string>* benchmark_names);
// ------------------------------------------------------
// Benchmark registration object. The BENCHMARK() macro expands
// into an internal::Benchmark* object. Various methods can
// be called on this object to change the properties of the benchmark.
// Each method returns "this" so that multiple method calls can
// chained into one expression.
class Benchmark {
public:
// The Benchmark takes ownership of the Callback pointed to by f.
Benchmark(const char* name, BenchmarkFunction f);
~Benchmark();
// Note: the following methods all return "this" so that multiple
// method calls can be chained together in one expression.
// Run this benchmark once with "x" as the extra argument passed
// to the function.
// REQUIRES: The function passed to the constructor must accept an arg1.
Benchmark* Arg(int x);
// Run this benchmark once for a number of values picked from the
// range [start..limit]. (start and limit are always picked.)
// REQUIRES: The function passed to the constructor must accept an arg1.
Benchmark* Range(int start, int limit);
// Run this benchmark once for every value in the range [start..limit]
// REQUIRES: The function passed to the constructor must accept an arg1.
Benchmark* DenseRange(int start, int limit);
// Run this benchmark once with "x,y" as the extra arguments passed
// to the function.
// REQUIRES: The function passed to the constructor must accept arg1,arg2.
Benchmark* ArgPair(int x, int y);
// Pick a set of values A from the range [lo1..hi1] and a set
// of values B from the range [lo2..hi2]. Run the benchmark for
// every pair of values in the cartesian product of A and B
// (i.e., for all combinations of the values in A and B).
// REQUIRES: The function passed to the constructor must accept arg1,arg2.
Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);
// Pass this benchmark object to *func, which can customize
// the benchmark by calling various methods like Arg, ArgPair,
// Threads, etc.
Benchmark* Apply(void (*func)(Benchmark* benchmark));
// Support for running multiple copies of the same benchmark concurrently
// in multiple threads. This may be useful when measuring the scaling
// of some piece of code.
// Run one instance of this benchmark concurrently in t threads.
Benchmark* Threads(int t);
// Pick a set of values T from [min_threads,max_threads].
// min_threads and max_threads are always included in T. Run this
// benchmark once for each value in T. The benchmark run for a
// particular value t consists of t threads running the benchmark
// function concurrently. For example, consider:
// BENCHMARK(Foo)->ThreadRange(1,16);
// This will run the following benchmarks:
// Foo in 1 thread
// Foo in 2 threads
// Foo in 4 threads
// Foo in 8 threads
// Foo in 16 threads
Benchmark* ThreadRange(int min_threads, int max_threads);
// Equivalent to ThreadRange(NumCPUs(), NumCPUs())
Benchmark* ThreadPerCpu();
// TODO(dominich): Control whether or not real-time is used for this benchmark
// TODO(dominich): Control the default number of iterations
// -------------------------------
// Following methods are not useful for clients
// Used inside the benchmark implementation
struct Instance;
struct ThreadStats;
// Extract the list of benchmark instances that match the specified
// regular expression.
static void FindBenchmarks(const std::string& re,
std::vector<Instance>* benchmarks);
// Measure the overhead of an empty benchmark to subtract later.
static void MeasureOverhead();
private:
std::vector<Benchmark::Instance> CreateBenchmarkInstances(int rangeXindex,
int rangeYindex);
std::string name_;
BenchmarkFunction function_;
int registration_index_;
std::vector<int> rangeX_;
std::vector<int> rangeY_;
std::vector<int> thread_counts_;
// Special value placed in thread_counts_ to stand for NumCPUs()
static const int kNumCpuMarker = -1;
// Special value used to indicate that no range is required.
static const int kNoRange = -1;
static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
static double MeasurePeakHeapMemory(const Instance& b);
static void RunInstance(const Instance& b, BenchmarkReporter* br);
friend class ::benchmark::State;
friend struct ::benchmark::internal::Benchmark::Instance;
friend void ::benchmark::internal::RunMatchingBenchmarks(
const std::string&, BenchmarkReporter*);
DISALLOW_COPY_AND_ASSIGN(Benchmark);
};
// ------------------------------------------------------
// Benchmarks reporter interface + data containers.
struct BenchmarkContextData {
int num_cpus;
double mhz_per_cpu;
//std::string cpu_info;
bool cpu_scaling_enabled;
// The number of chars in the longest benchmark name.
int name_field_width;
};
struct BenchmarkRunData {
BenchmarkRunData() :
thread_index(-1),
iterations(1),
real_accumulated_time(0),
cpu_accumulated_time(0),
bytes_per_second(0),
items_per_second(0),
max_heapbytes_used(0) {}
std::string benchmark_name;
std::string report_label;
int thread_index;
int64_t iterations;
double real_accumulated_time;
double cpu_accumulated_time;
// Zero if not set by benchmark.
double bytes_per_second;
double items_per_second;
// This is set to 0.0 if memory tracing is not enabled.
double max_heapbytes_used;
};
// Interface for custom benchmark result printers.
// By default, benchmark reports are printed to stdout. However an application
// can control the destination of the reports by calling
// RunMatchingBenchmarks and passing it a custom reporter object.
// The reporter object must implement the following interface.
class BenchmarkReporter {
public:
// Called once for every suite of benchmarks run.
// The parameter "context" contains information that the
// reporter may wish to use when generating its report, for example the
// platform under which the benchmarks are running. The benchmark run is
// never started if this function returns false, allowing the reporter
// to skip runs based on the context information.
virtual bool ReportContext(const BenchmarkContextData& context) = 0;
// Called once for each group of benchmark runs, gives information about
// cpu-time and heap memory usage during the benchmark run.
// Note that all the grouped benchmark runs should refer to the same
// benchmark, thus have the same name.
virtual void ReportRuns(const std::vector<BenchmarkRunData>& report) = 0;
virtual ~BenchmarkReporter();
};
// ------------------------------------------------------
// Internal implementation details follow; please ignore
// Given a collection of reports, computes their mean and stddev.
// REQUIRES: all runs in "reports" must be from the same benchmark.
void ComputeStats(const std::vector<BenchmarkRunData>& reports,
BenchmarkRunData* mean_data,
BenchmarkRunData* stddev_data);
// Simple reporter that outputs benchmark data to the console. This is the
// default reporter used by RunSpecifiedBenchmarks().
class ConsoleReporter : public BenchmarkReporter {
public:
virtual bool ReportContext(const BenchmarkContextData& context);
virtual void ReportRuns(const std::vector<BenchmarkRunData>& reports);
private:
std::string PrintMemoryUsage(double bytes);
virtual void PrintRunData(const BenchmarkRunData& report);
int name_field_width_;
};
} // end namespace internal
void Initialize(int* argc, const char** argv);
} // end namespace benchmark
// ------------------------------------------------------
// Macro to register benchmarks
// Helpers for generating unique variable names
#define BENCHMARK_CONCAT(a, b, c) BENCHMARK_CONCAT2(a, b, c)
#define BENCHMARK_CONCAT2(a, b, c) a ## b ## c
#define BENCHMARK(n) \
static ::benchmark::internal::Benchmark* \
BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \
(new ::benchmark::internal::Benchmark(#n, n))
// Old-style macros
#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
// This will register a benchmark for a templatized function. For example:
//
// template<int arg>
// void BM_Foo(int iters);
//
// BENCHMARK_TEMPLATE(BM_Foo, 1);
//
// will register BM_Foo<1> as a benchmark.
#define BENCHMARK_TEMPLATE(n, a) \
static ::benchmark::internal::Benchmark* \
BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \
(new ::benchmark::internal::Benchmark(#n "<" #a ">", n<a>))
#define BENCHMARK_TEMPLATE2(n, a, b) \
static ::benchmark::internal::Benchmark* \
BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \
(new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n<a, b>))
#endif // BENCHMARK_BENCHMARK_H_
#ifndef BENCHMARK_MACROS_H_
#define BENCHMARK_MACROS_H_
#include <assert.h>
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&);
// The arraysize(arr) macro returns the # of elements in an array arr.
// The expression is a compile-time constant, and therefore can be
// used in defining new arrays, for example. If you use arraysize on
// a pointer by mistake, you will get a compile-time error.
//
// One caveat is that, for C++03, arraysize() doesn't accept any array of
// an anonymous type or a type defined inside a function. In these rare
// cases, you have to use the unsafe ARRAYSIZE() macro below. This is
// due to a limitation in C++03's template system. The limitation has
// been removed in C++11.
// This template function declaration is used in defining arraysize.
// Note that the function doesn't need an implementation, as we only
// use its type.
template <typename T, size_t N>
char (&ArraySizeHelper(T (&array)[N]))[N];
// That gcc wants both of these prototypes seems mysterious. VC, for
// its part, can't decide which to use (another mystery). Matching of
// template overloads: the final frontier.
#ifndef COMPILER_MSVC
template <typename T, size_t N>
char (&ArraySizeHelper(const T (&array)[N]))[N];
#endif
#define arraysize(array) (sizeof(ArraySizeHelper(array)))
// The STATIC_ASSERT macro can be used to verify that a compile time
// expression is true. For example, you could use it to verify the
// size of a static array:
//
// STATIC_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
// content_type_names_incorrect_size);
//
// or to make sure a struct is smaller than a certain size:
//
// STATIC_ASSERT(sizeof(foo) < 128, foo_too_large);
//
// The second argument to the macro is the name of the variable. If
// the expression is false, most compilers will issue a warning/error
// containing the name of the variable.
template <bool>
struct StaticAssert {
};
#define STATIC_ASSERT(expr, msg) \
typedef StaticAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
// Implementation details of STATIC_ASSERT:
//
// - STATIC_ASSERT works by defining an array type that has -1
// elements (and thus is invalid) when the expression is false.
//
// - The simpler definition
//
// #define STATIC_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
//
// does not work, as gcc supports variable-length arrays whose sizes
// are determined at run-time (this is gcc's extension and not part
// of the C++ standard). As a result, gcc fails to reject the
// following code with the simple definition:
//
// int foo;
// STATIC_ASSERT(foo, msg); // not supposed to compile as foo is
// // not a compile-time constant.
//
// - By using the type StaticAssert<(bool(expr))>, we ensures that
// expr is a compile-time constant. (Template arguments must be
// determined at compile-time.)
//
// - The outer parentheses in StaticAssert<(bool(expr))> are necessary
// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
//
// StaticAssert<bool(expr)>
//
// instead, these compilers will refuse to compile
//
// STATIC_ASSERT(5 > 0, some_message);
//
// (They seem to think the ">" in "5 > 0" marks the end of the
// template argument list.)
//
// - The array size is (bool(expr) ? 1 : -1), instead of simply
//
// ((expr) ? 1 : -1).
//
// This is to avoid running into a bug in MS VC 7.1, which
// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
#define CHECK(b) do { if (!(b)) assert(false); } while(0)
#define CHECK_EQ(a, b) CHECK((a) == (b))
#define CHECK_GE(a, b) CHECK((a) >= (b))
#define CHECK_LE(a, b) CHECK((a) <= (b))
#define CHECK_GT(a, b) CHECK((a) > (b))
#define CHECK_LT(a, b) CHECK((a) < (b))
//
// Prevent the compiler from complaining about or optimizing away variables
// that appear unused.
#define ATTRIBUTE_UNUSED __attribute__ ((unused))
//
// For functions we want to force inline or not inline.
// Introduced in gcc 3.1.
#define ATTRIBUTE_ALWAYS_INLINE __attribute__ ((always_inline))
#define HAVE_ATTRIBUTE_ALWAYS_INLINE 1
#define ATTRIBUTE_NOINLINE __attribute__ ((noinline))
#define HAVE_ATTRIBUTE_NOINLINE 1
#endif // BENCHMARK_MACROS_H_
#include "benchmark/benchmark.h"
#include "benchmark/macros.h"
#include "colorprint.h"
#include "commandlineflags.h"
#include "mutex_lock.h"
#include "sleep.h"
#include "stat.h"
#include "sysinfo.h"
#include "walltime.h"
#include <pthread.h>
#include <semaphore.h>
#include <string.h>
#if defined OS_FREEBSD
#include <gnuregex.h>
#else
#include <regex.h>
#endif
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
DEFINE_string(benchmark_filter, ".",
"A regular expression that specifies the set of benchmarks "
"to execute. If this flag is empty, no benchmarks are run. "
"If this flag is the string \"all\", all benchmarks linked "
"into the process are run.");
DEFINE_int32(benchmark_min_iters, 100,
"Minimum number of iterations per benchmark");
DEFINE_int32(benchmark_max_iters, 1000000000,
"Maximum number of iterations per benchmark");
DEFINE_double(benchmark_min_time, 0.5,
"Minimum number of seconds we should run benchmark before "
"results are considered significant. For cpu-time based "
"tests, this is the lower bound on the total cpu time "
"used by all threads that make up the test. For real-time "
"based tests, this is the lower bound on the elapsed time "
"of the benchmark execution, regardless of number of "
"threads.");
DEFINE_bool(benchmark_memory_usage, false,
"Report memory usage for all benchmarks");
DEFINE_int32(benchmark_repetitions, 1,
"The number of runs of each benchmark. If greater than 1, the "
"mean and standard deviation of the runs will be reported.");
DEFINE_int32(v, 0, "The level of verbose logging to output");
DEFINE_bool(color_print, true, "Enables colorized logging.");
// Will be non-empty if heap checking is turned on, which would
// invalidate any benchmarks.
DECLARE_string(heap_check);
// The ""'s catch people who don't pass in a literal for "str"
#define strliterallen(str) (sizeof("" str "")-1)
// Must use a string literal for prefix.
#define memprefix(str, len, prefix) \
( (((len) >= strliterallen(prefix)) \
&& memcmp(str, prefix, strliterallen(prefix)) == 0) \
? str + strliterallen(prefix) \
: NULL )
namespace benchmark {
namespace {
// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta.
static const char kBigSIUnits[] = "kMGTPEZY";
// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi.
static const char kBigIECUnits[] = "KMGTPEZY";
// milli, micro, nano, pico, femto, atto, zepto, yocto.
static const char kSmallSIUnits[] = "munpfazy";
// We require that all three arrays have the same size.
STATIC_ASSERT(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
SI_and_IEC_unit_arrays_must_be_the_same_size);
STATIC_ASSERT(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
Small_SI_and_Big_SI_unit_arrays_must_be_the_same_size);
static const int kUnitsSize = arraysize(kBigSIUnits);
void ToExponentAndMantissa(double val, double thresh,
int precision, double one_k,
std::string* mantissa, int* exponent) {
std::stringstream mantissa_stream;
if (val < 0) {
mantissa_stream << "-";
val = -val;
}
// Adjust threshold so that it never excludes things which can't be rendered
// in 'precision' digits.
const double adjusted_threshold =
std::max(thresh, 1.0 / pow(10.0, precision));
const double big_threshold = adjusted_threshold * one_k;
const double small_threshold = adjusted_threshold;
if (val > big_threshold) {
// Positive powers
double scaled = val;
for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) {
scaled /= one_k;
if (scaled <= big_threshold) {
mantissa_stream << scaled;
*exponent = i + 1;
*mantissa = mantissa_stream.str();
return;
}
}
mantissa_stream << val;
*exponent = 0;
} else if (val < small_threshold) {
// Negative powers
double scaled = val;
for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
scaled *= one_k;
if (scaled >= small_threshold) {
mantissa_stream << scaled;
*exponent = -i - 1;
*mantissa = mantissa_stream.str();
return;
}
}
mantissa_stream << val;
*exponent = 0;
} else {
mantissa_stream << val;
*exponent = 0;
}
*mantissa = mantissa_stream.str();
}
std::string ExponentToPrefix(int exponent, bool iec) {
if (exponent == 0)
return "";
const int index = (exponent > 0 ? exponent - 1 : -exponent - 1);
if (index >= kUnitsSize)
return "";
const char *array = (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) :
kSmallSIUnits);
if (iec)
return array[index] + std::string("i");
else
return std::string(1, array[index]);
}
std::string ToBinaryStringFullySpecified(double value, double threshold,
int precision) {
std::string mantissa;
int exponent;
ToExponentAndMantissa(value, threshold, precision, 1024., &mantissa,
&exponent);
return mantissa + ExponentToPrefix(exponent, false);
}
inline void AppendHumanReadable(int n, std::string* str) {
std::stringstream ss;
// Round down to the nearest SI prefix.
ss << "/" << ToBinaryStringFullySpecified(n, 1.0, 0);
*str += ss.str();
}
inline std::string HumanReadableNumber(double n) {
// 1.1 means that figures up to 1.1k should be shown with the next unit down;
// this softens edge effects.
// 1 means that we should show one decimal place of precision.
return ToBinaryStringFullySpecified(n, 1.1, 1);
}
} // end namespace
namespace internal {
struct Benchmark::ThreadStats {
int64_t bytes_processed;
int64_t items_processed;
ThreadStats() { Reset(); }
void Reset() {
bytes_processed = 0;
items_processed = 0;
}
void Add(const ThreadStats& other) {
bytes_processed += other.bytes_processed;
items_processed += other.items_processed;
}
};
} // end namespace internal
namespace {
// Per-thread stats
pthread_key_t thread_stats_key;
internal::Benchmark::ThreadStats* thread_stats = nullptr;
// For non-dense Range, intermediate values are powers of kRangeMultiplier.
static const int kRangeMultiplier = 8;
// List of all registered benchmarks. Note that each registered
// benchmark identifies a family of related benchmarks to run.
static pthread_mutex_t benchmark_mutex;
static std::vector<internal::Benchmark*>* families = NULL;
bool running_benchmark = false;
// Should this benchmark report memory usage?
bool get_memory_usage;
// Should this benchmark base decisions off of real time rather than
// cpu time?
bool use_real_time;
// Overhead of an empty benchmark.
double overhead = 0.0;
void DeleteThreadStats(void* p) {
delete (internal::Benchmark::ThreadStats*) p;
}
// Return prefix to print in front of each reported line
const char* Prefix() {
#ifdef NDEBUG
return "";
#else
return "DEBUG: ";
#endif
}
// TODO
//static internal::MallocCounter *benchmark_mc;
static bool CpuScalingEnabled() {
// On Linux, the CPUfreq subsystem exposes CPU information as files on the
// local file system. If reading the exported files fails, then we may not be
// running on Linux, so we silently ignore all the read errors.
for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
std::stringstream ss;
ss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor";
std::string governor_file = ss.str();
FILE* file = fopen(governor_file.c_str(), "r");
if (!file)
break;
char buff[16];
size_t bytes_read = fread(buff, 1, sizeof(buff), file);
fclose(file);
if (memprefix(buff, bytes_read, "performance") == NULL)
return true;
}
return false;
}
} // namespace
namespace internal {
BenchmarkReporter::~BenchmarkReporter() {}
void ComputeStats(const std::vector<BenchmarkRunData>& reports,
BenchmarkRunData* mean_data,
BenchmarkRunData* stddev_data) {
// Accumulators.
Stat1_d real_accumulated_time_stat;
Stat1_d cpu_accumulated_time_stat;
Stat1_d bytes_per_second_stat;
Stat1_d items_per_second_stat;
Stat1MinMax_d max_heapbytes_used_stat;
int total_iters = 0;
// Populate the accumulators.
for (std::vector<BenchmarkRunData>::const_iterator it = reports.begin();
it != reports.end(); ++it) {
CHECK_EQ(reports[0].benchmark_name, it->benchmark_name);
total_iters += it->iterations;
real_accumulated_time_stat +=
Stat1_d(it->real_accumulated_time/it->iterations, it->iterations);
cpu_accumulated_time_stat +=
Stat1_d(it->cpu_accumulated_time/it->iterations, it->iterations);
items_per_second_stat += Stat1_d(it->items_per_second, it->iterations);
bytes_per_second_stat += Stat1_d(it->bytes_per_second, it->iterations);
max_heapbytes_used_stat += Stat1MinMax_d(it->max_heapbytes_used,
it->iterations);
}
// Get the data from the accumulator to BenchmarkRunData's.
mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
mean_data->iterations = total_iters;
mean_data->real_accumulated_time = real_accumulated_time_stat.Sum();
mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Sum();
mean_data->bytes_per_second = bytes_per_second_stat.Mean();
mean_data->items_per_second = items_per_second_stat.Mean();
mean_data->max_heapbytes_used = max_heapbytes_used_stat.Max();
// Only add label to mean/stddev if it is same for all runs
mean_data->report_label = reports[0].report_label;
for (size_t i = 1; i < reports.size(); i++) {
if (reports[i].report_label != reports[0].report_label) {
mean_data->report_label = "";
break;
}
}
stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
stddev_data->report_label = mean_data->report_label;
stddev_data->iterations = total_iters;
// We multiply by total_iters since PrintRunData expects a total time.
stddev_data->real_accumulated_time =
real_accumulated_time_stat.StdDev() * total_iters;
stddev_data->cpu_accumulated_time =
cpu_accumulated_time_stat.StdDev() * total_iters;
stddev_data->bytes_per_second = bytes_per_second_stat.StdDev();
stddev_data->items_per_second = items_per_second_stat.StdDev();
stddev_data->max_heapbytes_used = max_heapbytes_used_stat.StdDev();
}
std::string ConsoleReporter::PrintMemoryUsage(double bytes) {
if (!get_memory_usage || bytes < 0.0)
return "";
std::stringstream ss;
ss << " " << HumanReadableNumber(bytes) << "B peak-mem";
return ss.str();
}
bool ConsoleReporter::ReportContext(const BenchmarkContextData& context) {
name_field_width_ = context.name_field_width;
std::cout << "Benchmarking on " << context.num_cpus << " X "
<< context.mhz_per_cpu << " MHz CPU"
<< ((context.num_cpus > 1) ? "s" : "") << "\n";
int remainder_ms;
char time_buf[32];
std::cout << walltime::Print(walltime::Now(), "%Y/%m/%d-%H:%M:%S",
true, // use local timezone
time_buf, &remainder_ms) << "\n";
// Show details of CPU model, caches, TLBs etc.
// if (!context.cpu_info.empty())
// std::cout << "CPU: " << context.cpu_info.c_str();
if (context.cpu_scaling_enabled) {
std::cerr << "CPU scaling is enabled: Benchmark timings may be noisy.\n";
}
int output_width = fprintf(stdout, "%s%-*s %10s %10s %10s\n",
Prefix(), name_field_width_, "Benchmark",
"Time(ns)", "CPU(ns)", "Iterations");
std::cout << std::string(output_width - 1, '-').c_str() << "\n";
return true;
}
void ConsoleReporter::ReportRuns(const std::vector<BenchmarkRunData>& reports) {
for (std::vector<BenchmarkRunData>::const_iterator it = reports.begin();
it != reports.end(); ++it) {
CHECK_EQ(reports[0].benchmark_name, it->benchmark_name);
PrintRunData(*it);
}
// We don't report aggregated data if there was a single run.
if (reports.size() < 2)
return;
BenchmarkRunData mean_data;
BenchmarkRunData stddev_data;
internal::ComputeStats(reports, &mean_data, &stddev_data);
// Output using PrintRun.
PrintRunData(mean_data);
PrintRunData(stddev_data);
fprintf(stdout, "\n");
}
void ConsoleReporter::PrintRunData(const BenchmarkRunData& result) {
// Format bytes per second
std::string rate;
if (result.bytes_per_second > 0) {
std::stringstream ss;
ss << " " << HumanReadableNumber(result.bytes_per_second) << "B/s";
rate = ss.str();
}
// Format items per second
std::string items;
if (result.items_per_second > 0) {
std::stringstream ss;
ss << " " << HumanReadableNumber(result.items_per_second) << " items/s";
items = ss.str();
}
ColorPrintf(COLOR_DEFAULT, "%s", Prefix());
ColorPrintf(COLOR_GREEN, "%-*s ",
name_field_width_, result.benchmark_name.c_str());
ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
(result.real_accumulated_time * 1e9) /
(static_cast<double>(result.iterations)),
(result.cpu_accumulated_time * 1e9) /
(static_cast<double>(result.iterations)));
ColorPrintf(COLOR_CYAN, "%10lld", result.iterations);
ColorPrintf(COLOR_DEFAULT, "%*s %s %s%s\n", 16, rate.c_str(), items.c_str(),
result.report_label.c_str(),
PrintMemoryUsage(result.max_heapbytes_used).c_str());
}
void MemoryUsage() {
//if (benchmark_mc) {
// benchmark_mc->Reset();
//} else {
get_memory_usage = true;
//}
}
void UseRealTime() {
use_real_time = true;
}
void PrintUsageAndExit() {
fprintf(stdout, "benchmark [--benchmark_filter=<regex>]\n"
" [--benchmark_min_iters=<min_iters>]\n"
" [--benchmark_max_iters=<max_iters>]\n"
" [--benchmark_min_time=<min_time>]\n"
// " [--benchmark_memory_usage]\n"
" [--benchmark_repetitions=<num_repetitions>]\n"
" [--color_print={true|false}]\n"
" [--v=<verbosity>]\n");
exit(0);
}
void ParseCommandLineFlags(int* argc, const char** argv) {
for (int i = 1; i < *argc; ++i) {
if (ParseStringFlag(argv[i], "benchmark_filter",
&FLAGS_benchmark_filter) ||
ParseInt32Flag(argv[i], "benchmark_min_iters",
&FLAGS_benchmark_min_iters) ||
ParseInt32Flag(argv[i], "benchmark_max_iters",
&FLAGS_benchmark_max_iters) ||
ParseDoubleFlag(argv[i], "benchmark_min_time",
&FLAGS_benchmark_min_time) ||
// TODO(dominic)
// ParseBoolFlag(argv[i], "gbenchmark_memory_usage",
// &FLAGS_gbenchmark_memory_usage) ||
ParseInt32Flag(argv[i], "benchmark_repetitions",
&FLAGS_benchmark_repetitions) ||
ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) ||
ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
for (int j = i; j != *argc; ++j)
argv[j] = argv[j + 1];
--(*argc);
--i;
} else if (IsFlag(argv[i], "help"))
PrintUsageAndExit();
}
}
} // end namespace internal
// A clock that provides a fast mechanism to check if we're nearly done.
class State::FastClock {
public:
enum Type { REAL_TIME, CPU_TIME };
explicit FastClock(Type type)
: type_(type), approx_time_(NowMicros()) {
sem_init(&bg_done_, 0, 0);
pthread_create(&bg_, NULL, &BGThreadWrapper, this);
}
~FastClock() {
sem_post(&bg_done_);
pthread_join(bg_, NULL);
sem_destroy(&bg_done_);
}
// Returns true if the current time is guaranteed to be past "when_micros".
// This method is very fast.
inline bool HasReached(int64_t when_micros) {
return approx_time_ >= when_micros;
// NOTE: this is the same as we're dealing with an int64_t
//return (base::subtle::NoBarrier_Load(&approx_time_) >= when_micros);
}
// Returns the current time in microseconds past the epoch.
int64_t NowMicros() const {
double t = 0;
switch (type_) {
case REAL_TIME:
t = walltime::Now();
break;
case CPU_TIME:
t = MyCPUUsage() + ChildrenCPUUsage();
break;
}
return static_cast<int64_t>(t * 1e6);
}
// Reinitialize if necessary (since clock type may be change once benchmark
// function starts running - see UseRealTime).
void InitType(Type type) {
type_ = type;
approx_time_ = NowMicros();
// NOTE: This is the same barring a memory barrier
// base::subtle::Release_Store(&approx_time_, NowMicros());
}
private:
Type type_;
int64_t approx_time_; // Last time measurement taken by bg_
pthread_t bg_; // Background thread that updates last_time_ once every ms
sem_t bg_done_;
static void* BGThreadWrapper(void* that) {
((FastClock*)that)->BGThread();
return NULL;
}
void BGThread() {
int done = 0;
do {
SleepForMicroseconds(1000);
approx_time_ = NowMicros();
// NOTE: same code but no memory barrier. think on it.
//base::subtle::Release_Store(&approx_time_, NowMicros());
sem_getvalue(&bg_done_, &done);
} while (done == 0);
}
DISALLOW_COPY_AND_ASSIGN(FastClock);
};
namespace internal {
const int Benchmark::kNumCpuMarker;
// Information kept per benchmark we may want to run
struct Benchmark::Instance {
Instance()
: rangeXset(false), rangeX(kNoRange),
rangeYset(false), rangeY(kNoRange) {}
std::string name;
Benchmark* bm;
bool rangeXset;
int rangeX;
bool rangeYset;
int rangeY;
int threads; // Number of concurrent threads to use
bool multithreaded() const { return !bm->thread_counts_.empty(); }
};
} // end namespace internal
struct State::SharedState {
const internal::Benchmark::Instance* instance;
pthread_mutex_t mu;
int starting; // Number of threads that have entered STARTING state
int stopping; // Number of threads that have entered STOPPING state
int threads; // Number of total threads that are running concurrently
internal::Benchmark::ThreadStats stats;
std::vector<internal::BenchmarkRunData> runs; // accumulated runs
std::string label;
SharedState(const internal::Benchmark::Instance* b, int t)
: instance(b), starting(0), stopping(0), threads(t) {
}
DISALLOW_COPY_AND_ASSIGN(SharedState);
};
namespace internal {
Benchmark::Benchmark(const char* name, BenchmarkFunction f)
: name_(name), function_(f) {
mutex_lock l(&benchmark_mutex);
if (families == nullptr)
families = new std::vector<Benchmark*>;
registration_index_ = families->size();
families->push_back(this);
}
Benchmark::~Benchmark() {
mutex_lock l(&benchmark_mutex);
CHECK((*families)[registration_index_] == this);
(*families)[registration_index_] = NULL;
// Shrink the vector if convenient.
while (!families->empty() && families->back() == NULL)
families->pop_back();
}
Benchmark* Benchmark::Arg(int x) {
mutex_lock l(&benchmark_mutex);
rangeX_.push_back(x);
return this;
}
Benchmark* Benchmark::Range(int start, int limit) {
std::vector<int> arglist;
AddRange(&arglist, start, limit, kRangeMultiplier);
mutex_lock l(&benchmark_mutex);
for (size_t i = 0; i < arglist.size(); ++i)
rangeX_.push_back(arglist[i]);
return this;
}
Benchmark* Benchmark::DenseRange(int start, int limit) {
CHECK_GE(start, 0);
CHECK_LE(start, limit);
mutex_lock l(&benchmark_mutex);
for (int arg = start; arg <= limit; ++arg)
rangeX_.push_back(arg);
return this;
}
Benchmark* Benchmark::ArgPair(int x, int y) {
mutex_lock l(&benchmark_mutex);
rangeX_.push_back(x);
rangeY_.push_back(y);
return this;
}
Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
std::vector<int> arglist1, arglist2;
AddRange(&arglist1, lo1, hi1, kRangeMultiplier);
AddRange(&arglist2, lo2, hi2, kRangeMultiplier);
mutex_lock l(&benchmark_mutex);
rangeX_.resize(arglist1.size());
std::copy(arglist1.begin(), arglist1.end(), rangeX_.begin());
rangeY_.resize(arglist2.size());
std::copy(arglist2.begin(), arglist2.end(), rangeY_.begin());
return this;
}
Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
custom_arguments(this);
return this;
}
Benchmark* Benchmark::Threads(int t) {
CHECK_GT(t, 0);
mutex_lock l(&benchmark_mutex);
thread_counts_.push_back(t);
return this;
}
Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
CHECK_GT(min_threads, 0);
CHECK_GE(max_threads, min_threads);
mutex_lock l(&benchmark_mutex);
AddRange(&thread_counts_, min_threads, max_threads, 2);
return this;
}
Benchmark* Benchmark::ThreadPerCpu() {
mutex_lock l(&benchmark_mutex);
thread_counts_.push_back(kNumCpuMarker);
return this;
}
void Benchmark::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
CHECK_GE(lo, 0);
CHECK_GE(hi, lo);
// Add "lo"
dst->push_back(lo);
// Now space out the benchmarks in multiples of "mult"
for (int32_t i = 1; i < std::numeric_limits<int32_t>::max()/mult; i *= mult) {
if (i >= hi) break;
if (i > lo)
dst->push_back(i);
}
// Add "hi" (if different from "lo")
if (hi != lo)
dst->push_back(hi);
}
std::vector<Benchmark::Instance> Benchmark::CreateBenchmarkInstances(
int rangeXindex, int rangeYindex) {
// Special list of thread counts to use when none are specified
std::vector<int> one_thread;
one_thread.push_back(1);
std::vector<Benchmark::Instance> instances;
const bool is_multithreaded = (!thread_counts_.empty());
const std::vector<int>* thread_counts =
(is_multithreaded ? &thread_counts_ : &one_thread);
for (size_t t = 0; t < thread_counts->size(); ++t) {
int num_threads = (*thread_counts)[t];
if (num_threads == kNumCpuMarker)
num_threads = NumCPUs();
Instance instance;
instance.name = name_;
instance.bm = this;
instance.threads = num_threads;
if (rangeXindex != kNoRange) {
instance.rangeX = rangeX_[rangeXindex];
instance.rangeXset = true;
AppendHumanReadable(instance.rangeX, &instance.name);
}
if (rangeYindex != kNoRange) {
instance.rangeY = rangeY_[rangeYindex];
instance.rangeYset = true;
AppendHumanReadable(instance.rangeY, &instance.name);
}
// Add the number of threads used to the name
if (is_multithreaded) {
std::stringstream ss;
ss << "/threads:" << instance.threads;
instance.name += ss.str();
}
instances.push_back(instance);
}
return instances;
}
// Extract the list of benchmark instances that match the specified
// regular expression.
void Benchmark::FindBenchmarks(const std::string& spec,
std::vector<Instance>* benchmarks) {
// Make regular expression out of command-line flag
regex_t re;
int ec = regcomp(&re, spec.c_str(), REG_EXTENDED | REG_NOSUB);
if (ec != 0) {
size_t needed = regerror(ec, &re, NULL, 0);
char* errbuf = new char[needed];
regerror(ec, &re, errbuf, needed);
std::cerr << "Could not compile benchmark re: " << errbuf << "\n";
delete[] errbuf;
return;
}
mutex_lock l(&benchmark_mutex);
for (Benchmark* family : *families) {
if (family == nullptr) continue; // Family was deleted
// Match against filter.
if (regexec(&re, family->name_.c_str(), 0, NULL, 0) != 0) {
#ifdef DEBUG
std::cout << "Skipping " << family->name_ << "\n";
#endif
continue;
}
std::vector<Benchmark::Instance> instances;
if (family->rangeX_.empty() && family->rangeY_.empty()) {
instances = family->CreateBenchmarkInstances(kNoRange, kNoRange);
benchmarks->insert(benchmarks->end(), instances.begin(), instances.end());
} else if (family->rangeY_.empty()) {
for (size_t x = 0; x < family->rangeX_.size(); ++x) {
instances = family->CreateBenchmarkInstances(x, kNoRange);
benchmarks->insert(benchmarks->end(),
instances.begin(), instances.end());
}
} else {
for (size_t x = 0; x < family->rangeX_.size(); ++x) {
for (size_t y = 0; y < family->rangeY_.size(); ++y) {
instances = family->CreateBenchmarkInstances(x, y);
benchmarks->insert(benchmarks->end(),
instances.begin(), instances.end());
}
}
}
}
}
void Benchmark::MeasureOverhead() {
State::FastClock clock(State::FastClock::CPU_TIME);
State::SharedState state(NULL, 1);
State runner(&clock, &state, 0);
while (runner.KeepRunning()) {}
overhead = state.runs[0].real_accumulated_time /
static_cast<double>(state.runs[0].iterations);
#ifdef DEBUG
std::cout << "Per-iteration overhead for doing nothing: " << overhead << "\n";
#endif
}
void Benchmark::RunInstance(const Instance& b, BenchmarkReporter* br) {
use_real_time = false;
running_benchmark = true;
// get_memory_usage = FLAGS_gbenchmark_memory_usage;
State::FastClock clock(State::FastClock::CPU_TIME);
// Initialize the test runners.
State::SharedState state(&b, b.threads);
{
std::unique_ptr<State> runners[b.threads];
// TODO: create thread objects
for (int i = 0; i < b.threads; ++i)
runners[i].reset(new State(&clock, &state, i));
// Run them all.
for (int i = 0; i < b.threads; ++i) {
State* r = runners[i].release();
if (b.multithreaded()) {
// TODO: start pthreads (member of state?) and set up thread local
// pointers to stats
//pool->Add(base::NewCallback(r, &State::Run));
} else {
pthread_setspecific(thread_stats_key, thread_stats);
r->Run();
}
}
if (b.multithreaded()) {
// TODO: join all the threads
//pool->JoinAll();
}
}
/*
double mem_usage = 0;
if (get_memory_usage) {
// Measure memory usage
Notification mem_done;
BenchmarkRun mem_run;
BenchmarkRun::SharedState mem_shared(&b, 1);
mem_run.Init(&clock, &mem_shared, 0);
{
testing::MallocCounter mc(testing::MallocCounter::THIS_THREAD_ONLY);
benchmark_mc = &mc;
mem_run.Run(&mem_done);
mem_done.WaitForNotification();
benchmark_mc = NULL;
mem_usage = mc.PeakHeapGrowth();
}
}
*/
running_benchmark = false;
for (internal::BenchmarkRunData& report : state.runs) {
double seconds = (use_real_time ? report.real_accumulated_time :
report.cpu_accumulated_time);
// TODO: add the thread index here?
report.benchmark_name = b.name;
report.report_label = state.label;
report.bytes_per_second = state.stats.bytes_processed / seconds;
report.items_per_second = state.stats.items_processed / seconds;
report.max_heapbytes_used = MeasurePeakHeapMemory(b);
}
br->ReportRuns(state.runs);
}
// Run the specified benchmark, measure its peak memory usage, and
// return the peak memory usage.
double Benchmark::MeasurePeakHeapMemory(const Instance& b) {
if (!get_memory_usage)
return 0.0;
double bytes = 0.0;
/* TODO(dominich)
// Should we do multi-threaded runs?
const int num_threads = 1;
const int num_iters = 1;
{
// internal::MallocCounter mc(internal::MallocCounter::THIS_THREAD_ONLY);
running_benchmark = true;
timer_manager = new TimerManager(1, NULL);
// benchmark_mc = &mc;
timer_manager->StartTimer();
b.Run(num_iters);
running_benchmark = false;
delete timer_manager;
timer_manager = NULL;
// benchmark_mc = NULL;
// bytes = mc.PeakHeapGrowth();
}
*/
return bytes;
}
} // end namespace internal
State::State(FastClock* clock, SharedState* s, int t)
: thread_index(t),
state_(STATE_INITIAL),
clock_(clock),
shared_(s),
iterations_(0),
start_cpu_(0.0),
start_time_(0.0),
stop_time_micros_(0.0),
start_pause_(0.0),
pause_time_(0.0),
total_iterations_(0),
interval_micros_(
static_cast<int64_t>(1e6 * FLAGS_benchmark_min_time /
FLAGS_benchmark_repetitions)) {
}
bool State::KeepRunning() {
// Fast path
if (!clock_->HasReached(stop_time_micros_ + pause_time_)) {
++iterations_;
return true;
}
switch(state_) {
case STATE_INITIAL: return StartRunning();
case STATE_STARTING: CHECK(false); return true;
case STATE_RUNNING: return FinishInterval();
case STATE_STOPPING: return MaybeStop();
case STATE_STOPPED: CHECK(false); return true;
}
CHECK(false);
return false;
}
void State::PauseTiming() {
start_pause_ = walltime::Now();
}
void State::ResumeTiming() {
pause_time_ += walltime::Now() - start_pause_;
}
void State::SetBytesProcessed(int64_t bytes) {
CHECK_EQ(STATE_STOPPED, state_);
mutex_lock l(&shared_->mu);
internal::Benchmark::ThreadStats* thread_stats =
(internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key);
thread_stats->bytes_processed = bytes;
}
void State::SetItemsProcessed(int64_t items) {
CHECK_EQ(STATE_STOPPED, state_);
mutex_lock l(&shared_->mu);
internal::Benchmark::ThreadStats* thread_stats =
(internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key);
thread_stats->items_processed = items;
}
void State::SetLabel(const std::string& label) {
CHECK_EQ(STATE_STOPPED, state_);
mutex_lock l(&shared_->mu);
shared_->label = label;
}
int State::range_x() const {
CHECK(shared_->instance->rangeXset);
/*
<<
"Failed to get range_x as it was not set. Did you register your "
"benchmark with a range parameter?";
*/
return shared_->instance->rangeX;
}
int State::range_y() const {
CHECK(shared_->instance->rangeYset);
/* <<
"Failed to get range_y as it was not set. Did you register your "
"benchmark with a range parameter?";
*/
return shared_->instance->rangeY;
}
bool State::StartRunning() {
{
mutex_lock l(&shared_->mu);
CHECK_EQ(state_, STATE_INITIAL);
state_ = STATE_STARTING;
is_continuation_ = false;
CHECK_LT(shared_->starting, shared_->threads);
++shared_->starting;
if (shared_->starting == shared_->threads) {
// Last thread to start.
clock_->InitType(
use_real_time ? FastClock::REAL_TIME : FastClock::CPU_TIME);
} else {
// Wait for others.
// TODO(dominic): semaphore!
// while (pthread_getsemaphore(shared_->starting_sem_) !=
// shared_->threads) { }
//shared_->mu.Await(base::Condition(this, &State::AllStarting));
}
CHECK_EQ(state_, STATE_STARTING);
state_ = STATE_RUNNING;
}
NewInterval();
return true;
}
bool State::AllStarting() {
CHECK_LE(shared_->starting, shared_->threads);
return shared_->starting == shared_->threads;
}
void State::NewInterval() {
stop_time_micros_ = clock_->NowMicros() + interval_micros_;
if (!is_continuation_) {
#ifdef DEBUG
std::cout << "Starting new interval; stopping in " << interval_micros_
<< "\n";
#endif
iterations_ = 0;
pause_time_ = 0;
start_cpu_ = MyCPUUsage() + ChildrenCPUUsage();
start_time_ = walltime::Now();
} else {
#ifdef DEBUG
std::cout << "Continuing interval; stopping in " << interval_micros_
<< "\n";
#endif
}
}
bool State::FinishInterval() {
if (iterations_ < FLAGS_benchmark_min_iters / FLAGS_benchmark_repetitions &&
interval_micros_ < 5000000) {
interval_micros_ *= 2;
#ifdef DEBUG
std::cout << "Interval was too short; trying again for "
<< interval_micros_ << " useconds.\n";
#endif
is_continuation_ = false;
NewInterval();
return true;
}
internal::BenchmarkRunData data;
data.thread_index = thread_index;
data.iterations = iterations_;
data.thread_index = thread_index;
const double accumulated_time = walltime::Now() - start_time_;
const double total_overhead = 0.0; // TODO: overhead * iterations_;
CHECK_LT(pause_time_, accumulated_time);
CHECK_LT(pause_time_ + total_overhead, accumulated_time);
data.real_accumulated_time =
accumulated_time - (pause_time_ + total_overhead);
data.cpu_accumulated_time = (MyCPUUsage() + ChildrenCPUUsage()) - start_cpu_;
total_iterations_ += iterations_;
bool keep_going = false;
{
mutex_lock l(&shared_->mu);
if (is_continuation_)
shared_->runs.back() = data;
else
shared_->runs.push_back(data);
keep_going = RunAnotherInterval();
if (!keep_going) {
++shared_->stopping;
if (shared_->stopping < shared_->threads) {
// Other threads are still running, so continue running but without
// timing to present an expected background load to the other threads.
state_ = STATE_STOPPING;
keep_going = true;
} else {
state_ = STATE_STOPPED;
}
}
}
if (state_ == STATE_RUNNING) {
is_continuation_ = true;
NewInterval();
}
return keep_going;
}
bool State::RunAnotherInterval() const {
if (total_iterations_ < FLAGS_benchmark_min_iters)
return true;
if (total_iterations_ > FLAGS_benchmark_max_iters)
return false;
if (static_cast<int>(shared_->runs.size()) >= FLAGS_benchmark_repetitions)
return false;
return true;
}
bool State::MaybeStop() {
mutex_lock l(&shared_->mu);
if (shared_->stopping < shared_->threads) {
CHECK_EQ(state_, STATE_STOPPING);
return true;
}
state_ = STATE_STOPPED;
return false;
}
void State::Run() {
internal::Benchmark::ThreadStats* thread_stats =
(internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key);
thread_stats->Reset();
shared_->instance->bm->function_(*this);
{
mutex_lock l(&shared_->mu);
shared_->stats.Add(*thread_stats);
}
}
namespace internal {
void RunMatchingBenchmarks(const std::string& spec,
BenchmarkReporter* reporter) {
CHECK(reporter != NULL);
if (spec.empty()) return;
std::vector<internal::Benchmark::Instance> benchmarks;
internal::Benchmark::FindBenchmarks(spec, &benchmarks);
// Determine the width of the name field using a minimum width of 10.
// Also determine max number of threads needed.
int name_field_width = 10;
for (const internal::Benchmark::Instance& benchmark : benchmarks) {
// Add width for _stddev and threads:XX
if (benchmark.threads > 1 && FLAGS_benchmark_repetitions > 1) {
name_field_width = std::max<int>(name_field_width,
benchmark.name.size() + 17);
} else if (benchmark.threads> 1) {
name_field_width = std::max<int>(name_field_width,
benchmark.name.size() + 10);
} else if (FLAGS_benchmark_repetitions > 1) {
name_field_width = std::max<int>(name_field_width,
benchmark.name.size() + 7);
} else {
name_field_width = std::max<int>(name_field_width,
benchmark.name.size());
}
}
// Print header here
BenchmarkContextData context;
context.num_cpus = NumCPUs();
context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;
// context.cpu_info = base::CompactCPUIDInfoString();
context.cpu_scaling_enabled = CpuScalingEnabled();
context.name_field_width = name_field_width;
if (reporter->ReportContext(context)) {
for (internal::Benchmark::Instance& benchmark : benchmarks) {
//std::unique_ptr<thread::ThreadPool> pool;
//if (benchmark.threads > 0) {
// pool = new thread::ThreadPool(benchmark.threads);
// pool->StartWorkers();
//}
Benchmark::RunInstance(/*pool, */benchmark, reporter);
}
}
}
void FindMatchingBenchmarkNames(const std::string& spec,
std::vector<std::string>* benchmark_names) {
if (spec.empty()) return;
std::vector<internal::Benchmark::Instance> benchmarks;
internal::Benchmark::FindBenchmarks(spec, &benchmarks);
std::transform(benchmarks.begin(), benchmarks.end(), benchmark_names->begin(),
[] (const internal::Benchmark::Instance& b) { return b.name; } );
}
} // end namespace internal
void RunSpecifiedBenchmarks() {
std::string spec = FLAGS_benchmark_filter;
if (spec.empty() || spec == "all")
spec = "."; // Regexp that matches all benchmarks
internal::ConsoleReporter default_reporter;
internal::RunMatchingBenchmarks(spec, &default_reporter);
}
void Initialize(int* argc, const char** argv) {
//AtomicOps_Internalx86CPUFeaturesInit();
pthread_mutex_init(&benchmark_mutex, nullptr);
pthread_key_create(&thread_stats_key, DeleteThreadStats);
thread_stats = new internal::Benchmark::ThreadStats();
walltime::Initialize();
internal::Benchmark::MeasureOverhead();
internal::ParseCommandLineFlags(argc, argv);
}
} // end namespace benchmark
#include "colorprint.h"
#include <stdarg.h>
#include "commandlineflags.h"
DECLARE_bool(color_print);
namespace {
#ifdef OS_WINDOWS
typedef WORD PlatformColorCode;
#else
typedef const char* PlatformColorCode;
#endif
PlatformColorCode GetPlatformColorCode(LogColor color) {
#ifdef OS_WINDOWS
switch (color) {
case COLOR_RED: return FOREGROUND_RED;
case COLOR_GREEN: return FOREGROUND_GREEN;
case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
case COLOR_BLUE: return FOREGROUND_BLUE;
case COLOR_MAGENTA: return FOREGROUND_BLUE | FOREGROUND_RED;
case COLOR_CYAN: return FOREGROUND_BLUE | FOREGROUND_GREEN;
case COLOR_WHITE: // fall through to default
default: return 0;
}
#else
switch (color) {
case COLOR_RED: return "1";
case COLOR_GREEN: return "2";
case COLOR_YELLOW: return "3";
case COLOR_BLUE: return "4";
case COLOR_MAGENTA: return "5";
case COLOR_CYAN: return "6";
case COLOR_WHITE: return "7";
default: return NULL;
};
#endif
}
} // end namespace
void ColorPrintf(LogColor color, const char* fmt, ...) {
va_list args;
va_start(args, fmt);
if (!FLAGS_color_print) {
vprintf(fmt, args);
va_end(args);
return;
}
#ifdef OS_WINDOWS
const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
// Gets the current text color.
CONSOLE_SCREEN_BUFFER_INFO buffer_info;
GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
const WORD old_color_attrs = buffer_info.wAttributes;
// We need to flush the stream buffers into the console before each
// SetConsoleTextAttribute call lest it affect the text that is already
// printed but has not yet reached the console.
fflush(stdout);
SetConsoleTextAttribute(stdout_handle,
GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
vprintf(fmt, args);
fflush(stdout);
// Restores the text color.
SetConsoleTextAttribute(stdout_handle, old_color_attrs);
#else
const char* color_code = GetPlatformColorCode(color);
if (color_code)
fprintf(stdout, "\033[0;3%sm", color_code);
vprintf(fmt, args);
printf("\033[m"); // Resets the terminal to default.
#endif
va_end(args);
}
#ifndef BENCHMARK_COLORPRINT_H_
#define BENCHMARK_COLORPRINT_H_
enum LogColor {
COLOR_DEFAULT,
COLOR_RED,
COLOR_GREEN,
COLOR_YELLOW,
COLOR_BLUE,
COLOR_MAGENTA,
COLOR_CYAN,
COLOR_WHITE
};
void ColorPrintf(LogColor color, const char* fmt, ...);
#endif // BENCHMARK_COLORPRINT_H_
#include "commandlineflags.h"
#include <string.h>
#include <iostream>
#include <limits>
namespace benchmark {
// Parses 'str' for a 32-bit signed integer. If successful, writes
// the result to *value and returns true; otherwise leaves *value
// unchanged and returns false.
bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) {
// Parses the environment variable as a decimal integer.
char* end = NULL;
const long long_value = strtol(str, &end, 10); // NOLINT
// Has strtol() consumed all characters in the string?
if (*end != '\0') {
// No - an invalid character was encountered.
std::cerr << src_text << " is expected to be a 32-bit integer, "
<< "but actually has value \"" << str << "\".\n";
return false;
}
// Is the parsed value in the range of an Int32?
const int32_t result = static_cast<int32_t>(long_value);
if (long_value == std::numeric_limits<long>::max() ||
long_value == std::numeric_limits<long>::min() ||
// The parsed value overflows as a long. (strtol() returns
// LONG_MAX or LONG_MIN when the input overflows.)
result != long_value
// The parsed value overflows as an Int32.
) {
std::cerr << src_text << " is expected to be a 32-bit integer, "
<< "but actually has value \"" << str << "\", "
<< "which overflows.\n";
return false;
}
*value = result;
return true;
}
// Parses 'str' for a double. If successful, writes the result to *value and
// returns true; otherwise leaves *value unchanged and returns false.
bool ParseDouble(const std::string& src_text, const char* str, double* value) {
// Parses the environment variable as a decimal integer.
char* end = NULL;
const double double_value = strtod(str, &end); // NOLINT
// Has strtol() consumed all characters in the string?
if (*end != '\0') {
// No - an invalid character was encountered.
std::cerr << src_text << " is expected to be a double, "
<< "but actually has value \"" << str << "\".\n";
return false;
}
*value = double_value;
return true;
}
inline const char* GetEnv(const char* name) {
#if GTEST_OS_WINDOWS_MOBILE
// We are on Windows CE, which has no environment variables.
return NULL;
#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
// Environment variables which we programmatically clear will be set to the
// empty string rather than unset (NULL). Handle that case.
const char* const env = getenv(name);
return (env != NULL && env[0] != '\0') ? env : NULL;
#else
return getenv(name);
#endif
}
// Returns the name of the environment variable corresponding to the
// given flag. For example, FlagToEnvVar("foo") will return
// "BENCHMARK_FOO" in the open-source version.
static std::string FlagToEnvVar(const char* flag) {
const std::string flag_str(flag);
std::string env_var;
for (size_t i = 0; i != flag_str.length(); ++i)
env_var += ::toupper(flag_str.c_str()[i]);
return "BENCHMARK_" + env_var;
}
// Reads and returns the Boolean environment variable corresponding to
// the given flag; if it's not set, returns default_value.
//
// The value is considered true iff it's not "0".
bool BoolFromEnv(const char* flag, bool default_value) {
const std::string env_var = FlagToEnvVar(flag);
const char* const string_value = GetEnv(env_var.c_str());
return string_value == NULL ?
default_value : strcmp(string_value, "0") != 0;
}
// Reads and returns a 32-bit integer stored in the environment
// variable corresponding to the given flag; if it isn't set or
// doesn't represent a valid 32-bit integer, returns default_value.
int32_t Int32FromEnv(const char* flag, int32_t default_value) {
const std::string env_var = FlagToEnvVar(flag);
const char* const string_value = GetEnv(env_var.c_str());
if (string_value == NULL) {
// The environment variable is not set.
return default_value;
}
int32_t result = default_value;
if (!ParseInt32(std::string("Environment variable ") + env_var,
string_value, &result)) {
std::cout << "The default value " << default_value << " is used.\n";
return default_value;
}
return result;
}
// Reads and returns the string environment variable corresponding to
// the given flag; if it's not set, returns default_value.
const char* StringFromEnv(const char* flag, const char* default_value) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value = GetEnv(env_var.c_str());
return value == NULL ? default_value : value;
}
// Parses a string as a command line flag. The string should have
// the format "--flag=value". When def_optional is true, the "=value"
// part can be omitted.
//
// Returns the value of the flag, or NULL if the parsing failed.
const char* ParseFlagValue(const char* str,
const char* flag,
bool def_optional) {
// str and flag must not be NULL.
if (str == NULL || flag == NULL) return NULL;
// The flag must start with "--".
const std::string flag_str = std::string("--") + std::string(flag);
const size_t flag_len = flag_str.length();
if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
// Skips the flag name.
const char* flag_end = str + flag_len;
// When def_optional is true, it's OK to not have a "=value" part.
if (def_optional && (flag_end[0] == '\0'))
return flag_end;
// If def_optional is true and there are more characters after the
// flag name, or if def_optional is false, there must be a '=' after
// the flag name.
if (flag_end[0] != '=') return NULL;
// Returns the string after "=".
return flag_end + 1;
}
bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, true);
// Aborts if the parsing failed.
if (value_str == NULL) return false;
// Converts the string value to a bool.
*value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
return true;
}
bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
// Aborts if the parsing failed.
if (value_str == NULL) return false;
// Sets *value to the value of the flag.
return ParseInt32(std::string("The value of flag --") + flag,
value_str, value);
}
bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
// Aborts if the parsing failed.
if (value_str == NULL) return false;
// Sets *value to the value of the flag.
return ParseDouble(std::string("The value of flag --") + flag,
value_str, value);
}
bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
// Aborts if the parsing failed.
if (value_str == NULL) return false;
*value = value_str;
return true;
}
bool IsFlag(const char* str, const char* flag) {
return (ParseFlagValue(str, flag, true) != NULL);
}
} // end namespace benchmark
#ifndef BENCHMARK_COMMANDLINEFLAGS_H_
#define BENCHMARK_COMMANDLINEFLAGS_H_
#include <stdint.h>
#include <string>
// Macro for referencing flags.
#define FLAG(name) FLAGS_##name
// Macros for declaring flags.
#define DECLARE_bool(name) extern bool FLAG(name)
#define DECLARE_int32(name) extern int32_t FLAG(name)
#define DECLARE_int64(name) extern int64_t FLAG(name)
#define DECLARE_double(name) extern double FLAG(name)
#define DECLARE_string(name) extern std::string FLAG(name)
// Macros for defining flags.
#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val)
#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val)
#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val)
#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val)
#define DEFINE_string(name, default_val, doc) \
std::string FLAG(name) = (default_val)
namespace benchmark {
// Parses 'str' for a 32-bit signed integer. If successful, writes the result
// to *value and returns true; otherwise leaves *value unchanged and returns
// false.
bool ParseInt32(const std::string& src_text, const char* str, int32_t* value);
// Parses a bool/Int32/string from the environment variable
// corresponding to the given Google Test flag.
bool BoolFromEnv(const char* flag, bool default_val);
int32_t Int32FromEnv(const char* flag, int32_t default_val);
double DoubleFromEnv(const char* flag, double default_val);
const char* StringFromEnv(const char* flag, const char* default_val);
// Parses a string for a bool flag, in the form of either
// "--flag=value" or "--flag".
//
// In the former case, the value is taken as true as long as it does
// not start with '0', 'f', or 'F'.
//
// In the latter case, the value is taken as true.
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
bool ParseBoolFlag(const char* str, const char* flag, bool* value);
// Parses a string for an Int32 flag, in the form of
// "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
// Parses a string for a Double flag, in the form of
// "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
bool ParseDoubleFlag(const char* str, const char* flag, double* value);
// Parses a string for a string flag, in the form of
// "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
bool ParseStringFlag(const char* str, const char* flag, std::string* value);
// Returns true if the string matches the flag.
bool IsFlag(const char* str, const char* flag);
} // end namespace gbenchmark
#endif // BENCHMARK_COMMANDLINEFLAGS_H_
// ----------------------------------------------------------------------
// CycleClock
// A CycleClock tells you the current time in Cycles. The "time"
// is actually time since power-on. This is like time() but doesn't
// involve a system call and is much more precise.
//
// NOTE: Not all cpu/platform/kernel combinations guarantee that this
// clock increments at a constant rate or is synchronized across all logical
// cpus in a system.
//
// If you need the above guarantees, please consider using a different
// API. There are efforts to provide an interface which provides a millisecond
// granularity and implemented as a memory read. A memory read is generally
// cheaper than the CycleClock for many architectures.
//
// Also, in some out of order CPU implementations, the CycleClock is not
// serializing. So if you're trying to count at cycles granularity, your
// data might be inaccurate due to out of order instruction execution.
// ----------------------------------------------------------------------
#ifndef BENCHMARK_CYCLECLOCK_H_
#define BENCHMARK_CYCLECLOCK_H_
#include <stdint.h>
#if defined(OS_MACOSX)
# include <mach/mach_time.h>
#endif
// For MSVC, we want to use '_asm rdtsc' when possible (since it works
// with even ancient MSVC compilers), and when not possible the
// __rdtsc intrinsic, declared in <intrin.h>. Unfortunately, in some
// environments, <windows.h> and <intrin.h> have conflicting
// declarations of some other intrinsics, breaking compilation.
// Therefore, we simply declare __rdtsc ourselves. See also
// http://connect.microsoft.com/VisualStudio/feedback/details/262047
#if defined(COMPILER_MSVC) && !defined(_M_IX86)
extern "C" uint64_t __rdtsc();
#pragma intrinsic(__rdtsc)
#endif
#include <sys/time.h>
// NOTE: only i386 and x86_64 have been well tested.
// PPC, sparc, alpha, and ia64 are based on
// http://peter.kuscsik.com/wordpress/?p=14
// with modifications by m3b. See also
// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h
struct CycleClock {
// This should return the number of cycles since power-on. Thread-safe.
static inline int64_t Now() {
#if defined(OS_MACOSX)
// this goes at the top because we need ALL Macs, regardless of
// architecture, to return the number of "mach time units" that
// have passed since startup. See sysinfo.cc where
// InitializeSystemInfo() sets the supposed cpu clock frequency of
// macs to the number of mach time units per second, not actual
// CPU clock frequency (which can change in the face of CPU
// frequency scaling). Also note that when the Mac sleeps, this
// counter pauses; it does not continue counting, nor does it
// reset to zero.
return mach_absolute_time();
#elif defined(__i386__)
int64_t ret;
__asm__ volatile ("rdtsc" : "=A" (ret) );
return ret;
#elif defined(__x86_64__) || defined(__amd64__)
uint64_t low, high;
__asm__ volatile ("rdtsc" : "=a" (low), "=d" (high));
return (high << 32) | low;
#elif defined(__powerpc__) || defined(__ppc__)
// This returns a time-base, which is not always precisely a cycle-count.
int64_t tbl, tbu0, tbu1;
asm("mftbu %0" : "=r" (tbu0));
asm("mftb %0" : "=r" (tbl));
asm("mftbu %0" : "=r" (tbu1));
tbl &= -static_cast<int64>(tbu0 == tbu1);
// high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage)
return (tbu1 << 32) | tbl;
#elif defined(__sparc__)
int64_t tick;
asm(".byte 0x83, 0x41, 0x00, 0x00");
asm("mov %%g1, %0" : "=r" (tick));
return tick;
#elif defined(__ia64__)
int64_t itc;
asm("mov %0 = ar.itc" : "=r" (itc));
return itc;
#elif defined(COMPILER_MSVC) && defined(_M_IX86)
// Older MSVC compilers (like 7.x) don't seem to support the
// __rdtsc intrinsic properly, so I prefer to use _asm instead
// when I know it will work. Otherwise, I'll use __rdtsc and hope
// the code is being compiled with a non-ancient compiler.
_asm rdtsc
#elif defined(COMPILER_MSVC)
return __rdtsc();
#elif defined(ARMV3)
#if defined(ARMV6) // V6 is the earliest arch that has a standard cyclecount
uint32_t pmccntr;
uint32_t pmuseren;
uint32_t pmcntenset;
// Read the user mode perf monitor counter access permissions.
asm("mrc p15, 0, %0, c9, c14, 0" : "=r" (pmuseren));
if (pmuseren & 1) { // Allows reading perfmon counters for user mode code.
asm("mrc p15, 0, %0, c9, c12, 1" : "=r" (pmcntenset));
if (pmcntenset & 0x80000000ul) { // Is it counting?
asm("mrc p15, 0, %0, c9, c13, 0" : "=r" (pmccntr));
// The counter is set up to count every 64th cycle
return static_cast<int64>(pmccntr) * 64; // Should optimize to << 6
}
}
#endif
struct timeval tv;
gettimeofday(&tv, NULL);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__mips__)
// mips apparently only allows rdtsc for superusers, so we fall
// back to gettimeofday. It's possible clock_gettime would be better.
struct timeval tv;
gettimeofday(&tv, NULL);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#else
// The soft failover to a generic implementation is automatic only for ARM.
// For other platforms the developer is expected to make an attempt to create
// a fast implementation and use generic version if nothing better is available.
#error You need to define CycleTimer for your OS and CPU
#endif
}
};
#endif // BENCHMARK_CYCLECLOCK_H_
#ifndef BENCHMARK_MACROS_H_
#define BENCHMARK_MACROS_H_
#include <assert.h>
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&);
// The arraysize(arr) macro returns the # of elements in an array arr.
// The expression is a compile-time constant, and therefore can be
// used in defining new arrays, for example. If you use arraysize on
// a pointer by mistake, you will get a compile-time error.
//
// One caveat is that, for C++03, arraysize() doesn't accept any array of
// an anonymous type or a type defined inside a function. In these rare
// cases, you have to use the unsafe ARRAYSIZE() macro below. This is
// due to a limitation in C++03's template system. The limitation has
// been removed in C++11.
// This template function declaration is used in defining arraysize.
// Note that the function doesn't need an implementation, as we only
// use its type.
template <typename T, size_t N>
char (&ArraySizeHelper(T (&array)[N]))[N];
// That gcc wants both of these prototypes seems mysterious. VC, for
// its part, can't decide which to use (another mystery). Matching of
// template overloads: the final frontier.
#ifndef COMPILER_MSVC
template <typename T, size_t N>
char (&ArraySizeHelper(const T (&array)[N]))[N];
#endif
#define arraysize(array) (sizeof(ArraySizeHelper(array)))
// The STATIC_ASSERT macro can be used to verify that a compile time
// expression is true. For example, you could use it to verify the
// size of a static array:
//
// STATIC_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
// content_type_names_incorrect_size);
//
// or to make sure a struct is smaller than a certain size:
//
// STATIC_ASSERT(sizeof(foo) < 128, foo_too_large);
//
// The second argument to the macro is the name of the variable. If
// the expression is false, most compilers will issue a warning/error
// containing the name of the variable.
template <bool>
struct StaticAssert {
};
#define STATIC_ASSERT(expr, msg) \
typedef StaticAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
// Implementation details of STATIC_ASSERT:
//
// - STATIC_ASSERT works by defining an array type that has -1
// elements (and thus is invalid) when the expression is false.
//
// - The simpler definition
//
// #define STATIC_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
//
// does not work, as gcc supports variable-length arrays whose sizes
// are determined at run-time (this is gcc's extension and not part
// of the C++ standard). As a result, gcc fails to reject the
// following code with the simple definition:
//
// int foo;
// STATIC_ASSERT(foo, msg); // not supposed to compile as foo is
// // not a compile-time constant.
//
// - By using the type StaticAssert<(bool(expr))>, we ensures that
// expr is a compile-time constant. (Template arguments must be
// determined at compile-time.)
//
// - The outer parentheses in StaticAssert<(bool(expr))> are necessary
// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
//
// StaticAssert<bool(expr)>
//
// instead, these compilers will refuse to compile
//
// STATIC_ASSERT(5 > 0, some_message);
//
// (They seem to think the ">" in "5 > 0" marks the end of the
// template argument list.)
//
// - The array size is (bool(expr) ? 1 : -1), instead of simply
//
// ((expr) ? 1 : -1).
//
// This is to avoid running into a bug in MS VC 7.1, which
// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
#define CHECK(b) do { if (!(b)) assert(false); } while(0)
#define CHECK_EQ(a, b) CHECK((a) == (b))
#define CHECK_GE(a, b) CHECK((a) >= (b))
#define CHECK_LE(a, b) CHECK((a) <= (b))
#define CHECK_GT(a, b) CHECK((a) > (b))
#define CHECK_LT(a, b) CHECK((a) < (b))
#define ATTRIBUTE_UNUSED __attribute__ ((unused))
#endif // BENCHMARK_MACROS_H_
#ifndef BENCHMARK_MUTEX_LOCK_H_
#define BENCHMARK_MUTEX_LOCK_H_
#include <pthread.h>
class mutex_lock {
public:
explicit mutex_lock(pthread_mutex_t* mu) : mu_(mu) {
pthread_mutex_lock(mu_);
}
~mutex_lock() {
pthread_mutex_unlock(mu_);
}
private:
pthread_mutex_t* mu_;
};
#endif // BENCHMARK_MUTEX_LOCK_H_
#ifndef BENCHMARK_PORT_H_
#define BENCHMARK_PORT_H_
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&);
#endif // BENCHMARK_PORT_H_
#include "sleep.h"
#include <time.h>
#include <errno.h>
#ifdef OS_WINDOWS
// Window's _sleep takes milliseconds argument.
void SleepForMilliseconds(int milliseconds) {
_sleep(milliseconds);
}
void SleepForSeconds(double seconds) {
SleepForMilliseconds(static_cast<int>(seconds * 1000));
}
#else // OS_WINDOWS
static const int64_t kNumMillisPerSecond = 1000LL;
static const int64_t kNumMicrosPerMilli = 1000LL;
static const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL;
static const int64_t kNumNanosPerMicro = 1000LL;
void SleepForMicroseconds(int64_t microseconds) {
struct timespec sleep_time;
sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
; // Ignore signals and wait for the full interval to elapse.
}
void SleepForMilliseconds(int milliseconds) {
SleepForMicroseconds(static_cast<int64_t>(milliseconds) * kNumMicrosPerMilli);
}
void SleepForSeconds(double seconds) {
SleepForMicroseconds(static_cast<int64_t>(seconds * kNumMicrosPerSecond));
}
#endif // OS_WINDOWS
#ifndef BENCHMARK_SLEEP_H_
#define BENCHMARK_SLEEP_H_
#include <stdint.h>
void SleepForMicroseconds(int64_t microseconds);
void SleepForMilliseconds(int milliseconds);
void SleepForSeconds(double seconds);
#endif // BENCHMARK_SLEEP_H_
#ifndef BENCHMARK_STAT_H_
#define BENCHMARK_STAT_H_
#include <math.h>
#include <iostream>
#include <limits>
template <typename VType, typename NumType>
class Stat1;
template <typename VType, typename NumType>
class Stat1MinMax;
typedef Stat1<float, float> Stat1_f;
typedef Stat1<double, double> Stat1_d;
typedef Stat1MinMax<float, float> Stat1MinMax_f;
typedef Stat1MinMax<double, double> Stat1MinMax_d;
template <typename VType> class Vector2;
template <typename VType> class Vector3;
template <typename VType> class Vector4;
template <typename VType, typename NumType>
class Stat1 {
public:
typedef Stat1<VType, NumType> Self;
Stat1() {
Clear();
}
void Clear() {
numsamples_ = NumType();
sum_squares_ = sum_ = VType();
}
// Create a sample of value dat and weight 1
explicit Stat1(const VType &dat) {
sum_ = dat;
sum_squares_ = Sqr(dat);
numsamples_ = 1;
}
// Create statistics for all the samples between begin (included)
// and end(excluded)
explicit Stat1(const VType *begin, const VType *end) {
Clear();
for ( const VType *item = begin; item < end; ++item ) {
(*this) += Stat1(*item);
}
}
// Create a sample of value dat and weight w
Stat1(const VType &dat, const NumType &w) {
sum_ = w * dat;
sum_squares_ = w * Sqr(dat);
numsamples_ = w;
}
// Copy operator
Stat1(const Self &stat) {
sum_ = stat.sum_;
sum_squares_ = stat.sum_squares_;
numsamples_ = stat.numsamples_;
}
inline Self &operator =(const Self &stat) {
sum_ = stat.sum_;
sum_squares_ = stat.sum_squares_;
numsamples_ = stat.numsamples_;
return (*this);
}
// Merge statistics from two sample sets.
inline Self &operator +=(const Self &stat) {
sum_ += stat.sum_;
sum_squares_+= stat.sum_squares_;
numsamples_ += stat.numsamples_;
return (*this);
}
// The operation opposite to +=
inline Self &operator -=(const Self &stat) {
sum_ -= stat.sum_;
sum_squares_-= stat.sum_squares_;
numsamples_ -= stat.numsamples_;
return (*this);
}
// Multiply the weight of the set of samples by a factor k
inline Self &operator *=(const VType &k) {
sum_ *= k;
sum_squares_*= k;
numsamples_ *= k;
return (*this);
}
// Merge statistics from two sample sets.
inline Self operator + (const Self &stat) const {
return Self(*this) += stat;
}
// The operation opposite to +
inline Self operator - (const Self &stat) const {
return Self(*this) -= stat;
}
// Multiply the weight of the set of samples by a factor k
inline Self operator * (const VType &k) const {
return Self(*this) *= k;
}
// Return the total weight of this sample set
NumType NumSamples() const {
return numsamples_;
}
// Return the sum of this sample set
VType Sum() const {
return sum_;
}
// Return the mean of this sample set
VType Mean() const {
if (numsamples_ == 0) return VType();
return sum_ * (1.0 / numsamples_);
}
// Return the mean of this sample set and compute the standard deviation at
// the same time.
VType Mean(VType *stddev) const {
if (numsamples_ == 0) return VType();
VType mean = sum_ * (1.0 / numsamples_);
if (stddev) {
VType avg_squares = sum_squares_ * (1.0 / numsamples_);
*stddev = Sqrt(avg_squares - Sqr(mean));
}
return mean;
}
// Return the standard deviation of the sample set
VType StdDev() const {
if (numsamples_ == 0) return VType();
VType mean = Mean();
VType avg_squares = sum_squares_ * (1.0 / numsamples_);
return Sqrt(avg_squares - Sqr(mean));
}
private:
// Let i be the index of the samples provided (using +=)
// and weight[i],value[i] be the data of sample #i
// then the variables have the following meaning:
NumType numsamples_; // sum of weight[i];
VType sum_; // sum of weight[i]*value[i];
VType sum_squares_; // sum of weight[i]*value[i]^2;
// Template function used to square a number.
// For a vector we square all components
template <typename SType>
static inline SType Sqr(const SType &dat) {
return dat * dat;
}
template <typename SType>
static inline Vector2<SType> Sqr(const Vector2<SType> &dat) {
return dat.MulComponents(dat);
}
template <typename SType>
static inline Vector3<SType> Sqr(const Vector3<SType> &dat) {
return dat.MulComponents(dat);
}
template <typename SType>
static inline Vector4<SType> Sqr(const Vector4<SType> &dat) {
return dat.MulComponents(dat);
}
// Template function used to take the square root of a number.
// For a vector we square all components
template <typename SType>
static inline SType Sqrt(const SType &dat) {
// Avoid NaN due to imprecision in the calculations
if ( dat < 0 )
return 0;
return sqrt(dat);
}
template <typename SType>
static inline Vector2<SType> Sqrt(const Vector2<SType> &dat) {
// Avoid NaN due to imprecision in the calculations
return Max(dat, Vector2<SType>()).Sqrt();
}
template <typename SType>
static inline Vector3<SType> Sqrt(const Vector3<SType> &dat) {
// Avoid NaN due to imprecision in the calculations
return Max(dat, Vector3<SType>()).Sqrt();
}
template <typename SType>
static inline Vector4<SType> Sqrt(const Vector4<SType> &dat) {
// Avoid NaN due to imprecision in the calculations
return Max(dat, Vector4<SType>()).Sqrt();
}
};
// Useful printing function
template <typename VType, typename NumType>
inline std::ostream& operator<<(std::ostream& out,
const Stat1<VType, NumType>& s) {
out << "{ avg = " << s.Mean()
<< " std = " << s.StdDev()
<< " nsamples = " << s.NumSamples() << "}";
return out;
}
// Stat1MinMax: same as Stat1, but it also
// keeps the Min and Max values; the "-"
// operator is disabled because it cannot be implemented
// efficiently
template <typename VType, typename NumType>
class Stat1MinMax : public Stat1<VType, NumType> {
public:
typedef Stat1MinMax<VType, NumType> Self;
Stat1MinMax() {
Clear();
}
void Clear() {
Stat1<VType, NumType>::Clear();
if (std::numeric_limits<VType>::has_infinity) {
min_ = std::numeric_limits<VType>::infinity();
max_ = -std::numeric_limits<VType>::infinity();
} else {
min_ = std::numeric_limits<VType>::max();
max_ = std::numeric_limits<VType>::min();
}
}
// Create a sample of value dat and weight 1
explicit Stat1MinMax(const VType &dat) : Stat1<VType, NumType>(dat) {
max_ = dat;
min_ = dat;
}
// Create statistics for all the samples between begin (included)
// and end(excluded)
explicit Stat1MinMax(const VType *begin, const VType *end) {
Clear();
for ( const VType *item = begin; item < end; ++item ) {
(*this) += Stat1MinMax(*item);
}
}
// Create a sample of value dat and weight w
Stat1MinMax(const VType &dat, const NumType &w)
: Stat1<VType, NumType>(dat, w) {
max_ = dat;
min_ = dat;
}
// Copy operator
Stat1MinMax(const Self &stat) : Stat1<VType, NumType>(stat) {
max_ = stat.max_;
min_ = stat.min_;
}
inline Self &operator =(const Self &stat) {
this->Stat1<VType, NumType>::operator=(stat);
max_ = stat.max_;
min_ = stat.min_;
return (*this);
}
// Merge statistics from two sample sets.
inline Self &operator +=(const Self &stat) {
this->Stat1<VType, NumType>::operator+=(stat);
if (stat.max_ > max_) max_ = stat.max_;
if (stat.min_ < min_) min_ = stat.min_;
return (*this);
}
// Multiply the weight of the set of samples by a factor k
inline Self &operator *=(const VType &stat) {
this->Stat1<VType, NumType>::operator*=(stat);
return (*this);
}
// Merge statistics from two sample sets.
inline Self operator + (const Self &stat) const {
return Self(*this) += stat;
}
// Multiply the weight of the set of samples by a factor k
inline Self operator * (const VType &k) const {
return Self(*this) *= k;
}
private:
// The - operation makes no sense with Min/Max
// unless we keep the full list of values (but we don't)
// make it private, and let it undefined so nobody can call it
Self &operator -=(const Self &stat); // senseless. let it undefined.
// The operation opposite to -
Self operator - (const Self &stat) const; // senseless. let it undefined.
public:
// Return the maximal value in this sample set
VType Max() const {
return max_;
}
// Return the minimal value in this sample set
VType Min() const {
return min_;
}
private:
// Let i be the index of the samples provided (using +=)
// and weight[i],value[i] be the data of sample #i
// then the variables have the following meaning:
VType max_; // max of value[i]
VType min_; // min of value[i]
};
// Useful printing function
template <typename VType, typename NumType>
inline std::ostream& operator <<(std::ostream& out,
const Stat1MinMax<VType, NumType>& s) {
out << "{ avg = " << s.Mean()
<< " std = " << s.StdDev()
<< " nsamples = " << s.NumSamples()
<< " min = " << s.Min()
<< " max = " << s.Max() << "}";
return out;
}
#endif // BENCHMARK_STAT_H_
#include "sysinfo.h"
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>
#include <iostream>
#include <limits>
#include "cycleclock.h"
#include "macros.h"
#include "mutex_lock.h"
#include "sleep.h"
namespace {
pthread_once_t cpuinfo_init = PTHREAD_ONCE_INIT;
double cpuinfo_cycles_per_second = 1.0;
int cpuinfo_num_cpus = 1; // Conservative guess
static pthread_mutex_t cputimens_mutex;
// Helper function estimates cycles/sec by observing cycles elapsed during
// sleep(). Using small sleep time decreases accuracy significantly.
int64_t EstimateCyclesPerSecond(const int estimate_time_ms) {
CHECK(estimate_time_ms > 0);
double multiplier = 1000.0 / (double)estimate_time_ms; // scale by this much
const int64_t start_ticks = CycleClock::Now();
SleepForMilliseconds(estimate_time_ms);
const int64_t guess = int64_t(multiplier * (CycleClock::Now() - start_ticks));
return guess;
}
// Helper function for reading an int from a file. Returns true if successful
// and the memory location pointed to by value is set to the value read.
bool ReadIntFromFile(const char *file, int *value) {
bool ret = false;
int fd = open(file, O_RDONLY);
if (fd != -1) {
char line[1024];
char* err;
memset(line, '\0', sizeof(line));
CHECK(read(fd, line, sizeof(line) - 1));
const int temp_value = strtol(line, &err, 10);
if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
*value = temp_value;
ret = true;
}
close(fd);
}
return ret;
}
void InitializeSystemInfo() {
bool saw_mhz = false;
// TODO: destroy this
pthread_mutex_init(&cputimens_mutex, NULL);
#if defined OS_LINUX || defined OS_CYGWIN
char line[1024];
char* err;
int freq;
// If the kernel is exporting the tsc frequency use that. There are issues
// where cpuinfo_max_freq cannot be relied on because the BIOS may be
// exporintg an invalid p-state (on x86) or p-states may be used to put the
// processor in a new mode (turbo mode). Essentially, those frequencies
// cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
// well.
if (!saw_mhz &&
ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) {
// The value is in kHz (as the file name suggests). For example, on a
// 2GHz warpstation, the file contains the value "2000000".
cpuinfo_cycles_per_second = freq * 1000.0;
saw_mhz = true;
}
// If CPU scaling is in effect, we want to use the *maximum* frequency,
// not whatever CPU speed some random processor happens to be using now.
if (!saw_mhz &&
ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
&freq)) {
// The value is in kHz. For example, on a 2GHz warpstation, the file
// contains the value "2000000".
cpuinfo_cycles_per_second = freq * 1000.0;
saw_mhz = true;
}
// Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq.
const char* pname = "/proc/cpuinfo";
int fd = open(pname, O_RDONLY);
if (fd == -1) {
perror(pname);
if (!saw_mhz) {
cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
}
return; // TODO: use generic tester instead?
}
double bogo_clock = 1.0;
bool saw_bogo = false;
int max_cpu_id = 0;
int num_cpus = 0;
line[0] = line[1] = '\0';
int chars_read = 0;
do { // we'll exit when the last read didn't read anything
// Move the next line to the beginning of the buffer
const int oldlinelen = strlen(line);
if (sizeof(line) == oldlinelen + 1) // oldlinelen took up entire line
line[0] = '\0';
else // still other lines left to save
memmove(line, line + oldlinelen+1, sizeof(line) - (oldlinelen+1));
// Terminate the new line, reading more if we can't find the newline
char* newline = strchr(line, '\n');
if (newline == NULL) {
const int linelen = strlen(line);
const int bytes_to_read = sizeof(line)-1 - linelen;
CHECK(bytes_to_read > 0); // because the memmove recovered >=1 bytes
chars_read = read(fd, line + linelen, bytes_to_read);
line[linelen + chars_read] = '\0';
newline = strchr(line, '\n');
}
if (newline != NULL)
*newline = '\0';
// When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
// accept postive values. Some environments (virtual machines) report zero,
// which would cause infinite looping in WallTime_Init.
if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) {
const char* freqstr = strchr(line, ':');
if (freqstr) {
cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0;
if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0)
saw_mhz = true;
}
} else if (strncasecmp(line, "bogomips", sizeof("bogomips")-1) == 0) {
const char* freqstr = strchr(line, ':');
if (freqstr) {
bogo_clock = strtod(freqstr+1, &err) * 1000000.0;
if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0)
saw_bogo = true;
}
} else if (strncasecmp(line, "processor", sizeof("processor")-1) == 0) {
num_cpus++; // count up every time we see an "processor :" entry
const char* freqstr = strchr(line, ':');
if (freqstr) {
const int cpu_id = strtol(freqstr+1, &err, 10);
if (freqstr[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id)
max_cpu_id = cpu_id;
}
}
} while (chars_read > 0);
close(fd);
if (!saw_mhz) {
if (saw_bogo) {
// If we didn't find anything better, we'll use bogomips, but
// we're not happy about it.
cpuinfo_cycles_per_second = bogo_clock;
} else {
// If we don't even have bogomips, we'll use the slow estimation.
cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
}
}
if (num_cpus == 0) {
fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n");
} else {
if ((max_cpu_id + 1) != num_cpus) {
fprintf(stderr,
"CPU ID assignments in /proc/cpuinfo seems messed up."
" This is usually caused by a bad BIOS.\n");
}
cpuinfo_num_cpus = num_cpus;
}
#elif defined OS_FREEBSD
// For this sysctl to work, the machine must be configured without
// SMP, APIC, or APM support. hz should be 64-bit in freebsd 7.0
// and later. Before that, it's a 32-bit quantity (and gives the
// wrong answer on machines faster than 2^32 Hz). See
// http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html
// But also compare FreeBSD 7.0:
// http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223
// 231 error = sysctl_handle_quad(oidp, &freq, 0, req);
// To FreeBSD 6.3 (it's the same in 6-STABLE):
// http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131
// 139 error = sysctl_handle_int(oidp, &freq, sizeof(freq), req);
#if __FreeBSD__ >= 7
uint64_t hz = 0;
#else
unsigned int hz = 0;
#endif
size_t sz = sizeof(hz);
const char *sysctl_path = "machdep.tsc_freq";
if ( sysctlbyname(sysctl_path, &hz, &sz, NULL, 0) != 0 ) {
fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
sysctl_path, strerror(errno));
cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
} else {
cpuinfo_cycles_per_second = hz;
}
// TODO: also figure out cpuinfo_num_cpus
#elif defined OS_WINDOWS
# pragma comment(lib, "shlwapi.lib") // for SHGetValue()
// In NT, read MHz from the registry. If we fail to do so or we're in win9x
// then make a crude estimate.
OSVERSIONINFO os;
os.dwOSVersionInfoSize = sizeof(os);
DWORD data, data_size = sizeof(data);
if (GetVersionEx(&os) &&
os.dwPlatformId == VER_PLATFORM_WIN32_NT &&
SUCCEEDED(SHGetValueA(HKEY_LOCAL_MACHINE,
"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
"~MHz", NULL, &data, &data_size)))
cpuinfo_cycles_per_second = (int64)data * (int64)(1000 * 1000); // was mhz
else
cpuinfo_cycles_per_second = EstimateCyclesPerSecond(500); // TODO <500?
// TODO: also figure out cpuinfo_num_cpus
#elif defined OS_MACOSX
// returning "mach time units" per second. the current number of elapsed
// mach time units can be found by calling uint64 mach_absolute_time();
// while not as precise as actual CPU cycles, it is accurate in the face
// of CPU frequency scaling and multi-cpu/core machines.
// Our mac users have these types of machines, and accuracy
// (i.e. correctness) trumps precision.
// See cycleclock.h: CycleClock::Now(), which returns number of mach time
// units on Mac OS X.
mach_timebase_info_data_t timebase_info;
mach_timebase_info(&timebase_info);
double mach_time_units_per_nanosecond =
static_cast<double>(timebase_info.denom) /
static_cast<double>(timebase_info.numer);
cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9;
int num_cpus = 0;
size_t size = sizeof(num_cpus);
int numcpus_name[] = { CTL_HW, HW_NCPU };
if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, 0, 0)
== 0
&& (size == sizeof(num_cpus)))
cpuinfo_num_cpus = num_cpus;
#else
// Generic cycles per second counter
cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
#endif
}
} // end namespace
#ifndef OS_WINDOWS
// getrusage() based implementation of MyCPUUsage
static double MyCPUUsageRUsage() {
struct rusage ru;
if (getrusage(RUSAGE_SELF, &ru) == 0) {
return (static_cast<double>(ru.ru_utime.tv_sec) +
static_cast<double>(ru.ru_utime.tv_usec)*1e-6 +
static_cast<double>(ru.ru_stime.tv_sec) +
static_cast<double>(ru.ru_stime.tv_usec)*1e-6);
} else {
return 0.0;
}
}
static bool MyCPUUsageCPUTimeNsLocked(double *cputime) {
static int cputime_fd = -1;
if (cputime_fd == -1) {
cputime_fd = open("/proc/self/cputime_ns", O_RDONLY);
if (cputime_fd < 0) {
cputime_fd = -1;
return false;
}
}
char buff[64];
memset(buff, 0, sizeof(buff));
if (pread(cputime_fd, buff, sizeof(buff)-1, 0) <= 0) {
close(cputime_fd);
cputime_fd = -1;
return false;
}
unsigned long long result = strtoull(buff, NULL, 0);
if (result == (std::numeric_limits<unsigned long long>::max)()) {
close(cputime_fd);
cputime_fd = -1;
return false;
}
*cputime = static_cast<double>(result) / 1e9;
return true;
}
double MyCPUUsage() {
{
mutex_lock l(&cputimens_mutex);
static bool use_cputime_ns = true;
if (use_cputime_ns) {
double value;
if (MyCPUUsageCPUTimeNsLocked(&value)) {
return value;
}
// Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
std::cout << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
use_cputime_ns = false;
}
}
return MyCPUUsageRUsage();
}
double ChildrenCPUUsage() {
struct rusage ru;
if (getrusage(RUSAGE_CHILDREN, &ru) == 0) {
return (static_cast<double>(ru.ru_utime.tv_sec) +
static_cast<double>(ru.ru_utime.tv_usec)*1e-6 +
static_cast<double>(ru.ru_stime.tv_sec) +
static_cast<double>(ru.ru_stime.tv_usec)*1e-6);
} else {
return 0.0;
}
}
#endif // OS_WINDOWS
double CyclesPerSecond(void) {
pthread_once(&cpuinfo_init, &InitializeSystemInfo);
return cpuinfo_cycles_per_second;
}
int NumCPUs(void) {
pthread_once(&cpuinfo_init, &InitializeSystemInfo);
return cpuinfo_num_cpus;
}
#ifndef BENCHMARK_SYSINFO_H_
#define BENCHMARK_SYSINFO_H_
double MyCPUUsage();
double ChildrenCPUUsage();
int NumCPUs();
double CyclesPerSecond();
#endif // BENCHMARK_SYSINFO_H_
#include "walltime.h"
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <atomic>
#include <limits>
#include "cycleclock.h"
#include "macros.h"
#include "sysinfo.h"
namespace walltime {
namespace {
const double kMaxErrorInterval = 100e-6;
std::atomic<bool> initialized(false);
WallTime base_walltime = 0.0;
int64_t base_cycletime = 0;
int64_t cycles_per_second;
double seconds_per_cycle;
uint32_t last_adjust_time = 0;
std::atomic<int32_t> drift_adjust(0);
int64_t max_interval_cycles = 0;
// Helper routines to load/store a float from an AtomicWord. Required because
// g++ < 4.7 doesn't support std::atomic<float> correctly. I cannot wait to get
// rid of this horror show.
inline void SetDrift(float f) {
int32_t w;
memcpy(&w, &f, sizeof(f));
std::atomic_store(&drift_adjust, w);
}
inline float GetDrift() {
float f;
int32_t w = std::atomic_load(&drift_adjust);
memcpy(&f, &w, sizeof(f));
return f;
}
static_assert(sizeof(float) <= sizeof(int32_t),
"type sizes don't allow the drift_adjust hack");
WallTime Slow() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
bool SplitTimezone(WallTime value, bool local, struct tm* t,
double* subsecond) {
memset(t, 0, sizeof(*t));
if ((value < 0) || (value > std::numeric_limits<time_t>::max())) {
*subsecond = 0.0;
return false;
}
const time_t whole_time = static_cast<time_t>(value);
*subsecond = value - whole_time;
if (local)
localtime_r(&whole_time, t);
else
gmtime_r(&whole_time, t);
return true;
}
} // end namespace
// This routine should be invoked to initialize walltime.
// It is not intended for general purpose use.
void Initialize() {
CHECK(!std::atomic_load(&initialized));
cycles_per_second = static_cast<int64_t>(CyclesPerSecond());
CHECK(cycles_per_second != 0);
seconds_per_cycle = 1.0 / cycles_per_second;
max_interval_cycles = static_cast<int64_t>(
cycles_per_second * kMaxErrorInterval);
do {
base_cycletime = CycleClock::Now();
base_walltime = Slow();
} while (CycleClock::Now() - base_cycletime > max_interval_cycles);
// We are now sure that "base_walltime" and "base_cycletime" were produced
// within kMaxErrorInterval of one another.
SetDrift(0.0);
last_adjust_time = static_cast<uint32_t>(uint64_t(base_cycletime) >> 32);
std::atomic_store(&initialized, true);
}
WallTime Now() {
if (!std::atomic_load(&initialized))
return Slow();
WallTime now = 0.0;
WallTime result = 0.0;
int64_t ct = 0;
uint32_t top_bits = 0;
do {
ct = CycleClock::Now();
int64_t cycle_delta = ct - base_cycletime;
result = base_walltime + cycle_delta * seconds_per_cycle;
top_bits = static_cast<uint32_t>(uint64_t(ct) >> 32);
// Recompute drift no more often than every 2^32 cycles.
// I.e., @2GHz, ~ every two seconds
if (top_bits == last_adjust_time) { // don't need to recompute drift
return result + GetDrift();
}
now = Slow();
} while (CycleClock::Now() - ct > max_interval_cycles);
// We are now sure that "now" and "result" were produced within
// kMaxErrorInterval of one another.
SetDrift(now - result);
last_adjust_time = top_bits;
return now;
}
const char* Print(WallTime time, const char *format, bool local,
char* storage, int *remainder_us) {
struct tm split;
double subsecond;
if (!SplitTimezone(time, local, &split, &subsecond)) {
snprintf(storage, sizeof(storage), "Invalid time: %f", time);
} else {
if (remainder_us != NULL) {
*remainder_us = static_cast<int>((subsecond * 1000000) + 0.5);
if (*remainder_us > 999999) *remainder_us = 999999;
if (*remainder_us < 0) *remainder_us = 0;
}
strftime(storage, sizeof(storage), format, &split);
}
return storage;
}
} // end namespace walltime
#ifndef BENCHMARK_WALLTIME_H_
#define BENCHMARK_WALLTIME_H_
typedef double WallTime;
namespace walltime {
void Initialize();
WallTime Now();
// GIVEN: walltime, generic format string (as understood by strftime),
// a boolean flag specifying if the time is local or UTC (true=local).
// RETURNS: the formatted string. ALSO RETURNS: the storage printbuffer
// passed and the remaining number of microseconds (never printed in
// the string since strftime does not understand it)
const char* Print(WallTime time, const char *format, bool local,
char* storage, int *remainder_us);
} // end namespace walltime
#endif // BENCHMARK_WALLTIME_H_
#include "benchmark/benchmark.h"
#include <math.h>
#include <stdint.h>
#include <limits>
#include <list>
#include <map>
#include <set>
#include <sstream>
#include <vector>
namespace {
int ATTRIBUTE_NOINLINE Factorial(uint32_t n) {
return (n == 1) ? 1 : n * Factorial(n - 1);
}
double CalculatePi(int depth) {
double pi = 0.0;
for (int i = 0; i < depth; ++i) {
double numerator = static_cast<double>(((i % 2) * 2) - 1);
double denominator = static_cast<double>((2 * i) - 1);
pi += numerator / denominator;
}
return (pi - 1.0) * 4;
}
std::set<int> ConstructRandomSet(int size) {
std::set<int> s;
for (int i = 0; i < size; ++i)
s.insert(i);
return s;
}
static std::vector<int>* test_vector = NULL;
} // end namespace
#ifdef DEBUG
static void BM_Factorial(benchmark::State& state) {
int fac_42 = 0;
while (state.KeepRunning())
fac_42 = Factorial(8);
// Prevent compiler optimizations
CHECK(fac_42 != std::numeric_limits<int>::max());
}
BENCHMARK(BM_Factorial);
#endif
static void BM_CalculatePiRange(benchmark::State& state) {
double pi = 0.0;
while (state.KeepRunning())
pi = CalculatePi(state.range_x());
std::stringstream ss;
ss << pi;
state.SetLabel(ss.str());
}
BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
static void BM_CalculatePi(benchmark::State& state) {
static const int depth = 1024;
double pi ATTRIBUTE_UNUSED = 0.0;
while (state.KeepRunning()) {
pi = CalculatePi(depth);
}
}
BENCHMARK(BM_CalculatePi)->Threads(8);
BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
static void BM_SetInsert(benchmark::State& state) {
while (state.KeepRunning()) {
state.PauseTiming();
std::set<int> data = ConstructRandomSet(state.range_x());
state.ResumeTiming();
for (int j = 0; j < state.range_y(); ++j)
data.insert(rand());
}
}
BENCHMARK(BM_SetInsert)->RangePair(1<<10,8<<10, 1,10);
template<typename Q>
static void BM_Sequential(benchmark::State& state) {
Q q;
typename Q::value_type v;
while (state.KeepRunning())
for (int i = state.range_x(); --i; )
q.push_back(v);
const int64_t items_processed =
static_cast<int64_t>(state.iterations()) * state.range_x();
state.SetItemsProcessed(items_processed);
state.SetBytesProcessed(items_processed * sizeof(v));
}
BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>)->Range(1 << 0, 1 << 10);
BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
static void BM_StringCompare(benchmark::State& state) {
std::string s1(state.range_x(), '-');
std::string s2(state.range_x(), '-');
int r = 0;
while (state.KeepRunning())
r |= s1.compare(s2);
// Prevent compiler optimizations
CHECK(r != std::numeric_limits<int>::max());
}
BENCHMARK(BM_StringCompare)->Range(1, 1<<20);
static void BM_SetupTeardown(benchmark::State& state) {
if (state.thread_index == 0)
test_vector = new std::vector<int>();
while (state.KeepRunning())
test_vector->push_back(0);
if (state.thread_index == 0) {
delete test_vector;
test_vector = NULL;
}
}
BENCHMARK(BM_SetupTeardown);
static void BM_LongTest(benchmark::State& state) {
double tracker = 0.0;
while (state.KeepRunning())
for (int i = 0; i < state.range_x(); ++i)
tracker += i;
CHECK(tracker != 0.0);
}
BENCHMARK(BM_LongTest)->Range(1<<16,1<<28);
int main(int argc, const char* argv[]) {
benchmark::Initialize(&argc, argv);
CHECK(Factorial(8) == 40320);
CHECK(CalculatePi(1) == 0.0);
benchmark::RunSpecifiedBenchmarks();
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment