Make `PauseTiming()` and `ResumeTiming()` per thread. (#286)

* Change to using per-thread timers
* fix bad assertions
* fix copy paste error on windows
* Fix thread safety annotations
* Make null-log thread safe
* remove remaining globals
* use chrono for walltime since it is thread safe
* consolidate timer functions
* Add missing ctime include
* Rename to be consistent with Google style
* Format patch using clang-format
* cleanup -Wthread-safety configuration
* Don't trust _POSIX_FEATURE macros because OS X lies.
* Fix OS X thread timings
* attempt to fix mingw build
* Attempt to make mingw work again
* Revert old mingw workaround
* improve diagnostics
* Drastically improve OS X measurements
* Use average real time instead of max

Make `PauseTiming()` and `ResumeTiming()` per thread. (#286)
cba945e3 · Eric · GitHub · 94c2a30a · cba945e3 · cba945e3
Commit cba945e3 authored Sep 02, 2016 by Eric Committed by GitHub Sep 02, 2016
19 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,8 +87,7 @@ else()
    add_cxx_compiler_flag(-Wstrict-aliasing)
  endif()
  add_cxx_compiler_flag(-Wthread-safety)
-  if (HAVE_WTHREAD_SAFETY)
+  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
-    add_definitions(-DHAVE_WTHREAD_SAFETY)
    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
  endif()
@@ -152,7 +151,6 @@ cxx_feature_check(STD_REGEX)
 cxx_feature_check(GNU_POSIX_REGEX)
 cxx_feature_check(POSIX_REGEX)
 cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
 find_package(Threads REQUIRED)

--- a/include/benchmark/benchmark_api.h
+++ b/include/benchmark/benchmark_api.h
@@ -270,31 +270,25 @@ enum BigO {
 // computational complexity for the benchmark.
 typedef double(BigOFunc)(int);
+namespace internal {
+class ThreadTimer;
+class ThreadManager;
+}
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
 class State {
 public:
-  State(size_t max_iters, const std::vector<int>& ranges,
-        int thread_i, int n_threads);
  // Returns true if the benchmark should continue through another iteration.
  // NOTE: A benchmark may not return from the test until KeepRunning() has
  // returned false.
  bool KeepRunning() {
    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
-      assert(!finished_);
+      StartKeepRunning();
-      started_ = true;
-      ResumeTiming();
    }
    bool const res = total_iterations_++ < max_iterations;
    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
-      assert(started_ && (!finished_ || error_occurred_));
+      FinishKeepRunning();
-      if (!error_occurred_) {
-        PauseTiming();
-      }
-      // Total iterations now is one greater than max iterations. Fix this.
-      total_iterations_ = max_iterations;
-      finished_ = true;
    }
    return res;
  }
@@ -304,10 +298,11 @@ public:
  // Stop the benchmark timer.  If not called, the timer will be
  // automatically stopped after KeepRunning() returns false for the first time.
  //
-  // For threaded benchmarks the PauseTiming() function acts
+  // For threaded benchmarks the PauseTiming() function only pauses the timing
-  // like a barrier.  I.e., the ith call by a particular thread to this
+  // for the current thread.
-  // function will block until all active threads have made their ith call.
+  //
-  // The timer will stop when the last thread has called this function.
+  // NOTE: The "real time" measurement is per-thread. If different threads
+  // report different measurements the largest one is reported.
  //
  // NOTE: PauseTiming()/ResumeTiming() are relatively
  // heavyweight, and so their use should generally be avoided
@@ -319,11 +314,6 @@ public:
  // Start the benchmark timer.  The timer is NOT running on entrance to the
  // benchmark function. It begins running after the first call to KeepRunning()
  //
-  // For threaded benchmarks the ResumeTiming() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all active threads have made their ith call.
-  // The timer will start when the last thread has called this function.
-  //
  // NOTE: PauseTiming()/ResumeTiming() are relatively
  // heavyweight, and so their use should generally be avoided
  // within each benchmark iteration, if possible.
@@ -335,10 +325,10 @@ public:
  // thread and report an error with the specified 'msg'. After this call
  // the user may explicitly 'return' from the benchmark.
  //
-  // For threaded benchmarks only the current thread stops executing. If
+  // For threaded benchmarks only the current thread stops executing and future
-  // multiple threads report an error only the first error message is used.
+  // calls to `KeepRunning()` will block until all threads have completed
-  // The current thread is no longer considered 'active' by
+  // the `KeepRunning()` loop. If multiple threads report an error only the
-  // 'PauseTiming()' and 'ResumingTiming()'.
+  // first error message is used.
  //
  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
  // the current scope immediately. If the function is called from within
@@ -351,10 +341,8 @@ public:
  // is used instead of automatically measured time if UseManualTime() was
  // specified.
  //
-  // For threaded benchmarks the SetIterationTime() function acts
+  // For threaded benchmarks the final value will be set to the largest
-  // like a barrier.  I.e., the ith call by a particular thread to this
+  // reported value.
-  // function will block until all threads have made their ith call.
-  // The time will be set by the last thread to call this function.
  void SetIterationTime(double seconds);
  // Set the number of bytes processed by the current benchmark
@@ -465,7 +453,16 @@ public:
  const int threads;
  const size_t max_iterations;
-private:
+  // TODO make me private
+  State(size_t max_iters, const std::vector<int>& ranges, int thread_i,
+        int n_threads, internal::ThreadTimer* timer,
+        internal::ThreadManager* manager);
+ private:
+  void StartKeepRunning();
+  void FinishKeepRunning();
+  internal::ThreadTimer* timer_;
+  internal::ThreadManager* manager_;
  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
 };

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,9 +8,9 @@ endif()
 # Define the source files
 set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc"
-                 "console_reporter.cc" "csv_reporter.cc" "json_reporter.cc"
+                 "console_reporter.cc" "csv_reporter.cc"
-                 "log.cc" "reporter.cc" "sleep.cc" "string_util.cc"
+                 "json_reporter.cc" "reporter.cc" "sleep.cc"
-                 "sysinfo.cc" "walltime.cc" "complexity.cc")
+                 "string_util.cc" "sysinfo.cc" "complexity.cc" "timers.cc")
 # Add headers to the list of source files. cmake does not require this,
 # but IDEs such as Visual Studio need this to add the headers
 # to the generated project.
@@ -19,8 +19,7 @@ list(APPEND SOURCE_FILES "${_d}/benchmark.h" "${_d}/benchmark_api.h"
            "${_d}/macros.h" "${_d}/reporter.h" "arraysize.h" "check.h"
            "colorprint.h" "commandlineflags.h" "complexity.h"
            "cycleclock.h" "internal_macros.h" "log.h" "mutex.h"
-            "re.h" "sleep.h" "stat.h" "string_util.h" "sysinfo.h"
+            "re.h" "sleep.h" "stat.h" "string_util.h" "sysinfo.h" "timers.h")
-            "walltime.h")
 unset(_d)
 # Determine the correct regular expression engine to use

--- a/src/benchmark.cc
+++ b/src/benchmark.cc
--- a/src/check.h
+++ b/src/check.h
@@ -33,9 +33,7 @@ public:
          << check << "' failed. ";
  }
-  std::ostream& GetLog() {
+  LogType& GetLog() { return log_; }
-    return log_;
-  }
  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
      log_ << std::endl;
@@ -46,7 +44,7 @@ public:
  CheckHandler(const CheckHandler&) = delete;
  CheckHandler() = delete;
 private:
-  std::ostream& log_;
+ LogType& log_;
 };
 } // end namespace internal

--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -28,7 +28,7 @@
 #include "commandlineflags.h"
 #include "internal_macros.h"
 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"
 namespace benchmark {

--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@@ -23,7 +23,7 @@
 #include <vector>
 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"
 // File format reference: http://edoceo.com/utilitas/csv-file-format.

--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -23,7 +23,7 @@
 #include <vector>
 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"
 namespace benchmark {

--- a/src/log.cc
+++ b/src/log.cc
-#include "log.h"
-#include <iostream>
-namespace benchmark {
-namespace internal {
-int& LoggingLevelImp() {
-    static int level = 0;
-    return level;
-}
-void SetLogLevel(int value) {
-    LoggingLevelImp() = value;
-}
-int GetLogLevel() {
-    return LoggingLevelImp();
-}
-class NullLogBuffer : public std::streambuf
-{
-public:
-  int overflow(int c) {
-    return c;
-  }
-};
-std::ostream& GetNullLogInstance() {
-  static NullLogBuffer log_buff;
-  static std::ostream null_log(&log_buff);
-  return null_log;
-}
-std::ostream& GetErrorLogInstance() {
-  return std::clog;
-}
-} // end namespace internal
-} // end namespace benchmark
\ No newline at end of file
--- a/src/log.h
+++ b/src/log.h
 #ifndef BENCHMARK_LOG_H_
 #define BENCHMARK_LOG_H_
+#include <iostream>
 #include <ostream>
+#include "benchmark/macros.h"
 namespace benchmark {
 namespace internal {
-int GetLogLevel();
+typedef std::basic_ostream<char>&(EndLType)(std::basic_ostream<char>&);
-void SetLogLevel(int level);
+class LogType {
+  friend LogType& GetNullLogInstance();
+  friend LogType& GetErrorLogInstance();
+  // FIXME: Add locking to output.
+  template <class Tp>
+  friend LogType& operator<<(LogType&, Tp const&);
+  friend LogType& operator<<(LogType&, EndLType*);
+ private:
+  LogType(std::ostream* out) : out_(out) {}
+  std::ostream* out_;
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
+};
-std::ostream& GetNullLogInstance();
+template <class Tp>
-std::ostream& GetErrorLogInstance();
+LogType& operator<<(LogType& log, Tp const& value) {
+  if (log.out_) {
+    *log.out_ << value;
+  }
+  return log;
+}
+inline LogType& operator<<(LogType& log, EndLType* m) {
+  if (log.out_) {
+    *log.out_ << m;
+  }
+  return log;
+}
+inline int& LogLevel() {
+  static int log_level = 0;
+  return log_level;
+}
+inline LogType& GetNullLogInstance() {
+  static LogType log(nullptr);
+  return log;
+}
+inline LogType& GetErrorLogInstance() {
+  static LogType log(&std::clog);
+  return log;
+}
-inline std::ostream& GetLogInstanceForLevel(int level) {
+inline LogType& GetLogInstanceForLevel(int level) {
-  if (level <= GetLogLevel()) {
+  if (level <= LogLevel()) {
    return GetErrorLogInstance();
  }
  return GetNullLogInstance();

--- a/src/mutex.h
+++ b/src/mutex.h
@@ -4,6 +4,8 @@
 #include <mutex>
 #include <condition_variable>
+#include "check.h"
 // Enable thread safety attributes only with clang.
 // The attributes can be safely erased when compiling with other compilers.
 #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
@@ -105,36 +107,58 @@ private:
  MutexLockImp ml_;
 };
+class Barrier {
+ public:
+  Barrier(int num_threads) : running_threads_(num_threads) {}
-class Notification
+  // Called by each thread
-{
+  bool wait() EXCLUDES(lock_) {
-public:
+    bool last_thread = false;
-  Notification() : notified_yet_(false) { }
-  void WaitForNotification() const EXCLUDES(mutex_) {
-    MutexLock m_lock(mutex_);
-    auto notified_fn = [this]() REQUIRES(mutex_) {
-                            return this->HasBeenNotified();
-                        };
-    cv_.wait(m_lock.native_handle(), notified_fn);
-  }
-  void Notify() EXCLUDES(mutex_) {
    {
-      MutexLock lock(mutex_);
+      MutexLock ml(lock_);
-      notified_yet_ = 1;
+      last_thread = createBarrier(ml);
    }
-    cv_.notify_all();
+    if (last_thread) phase_condition_.notify_all();
+    return last_thread;
  }
-private:
+  void removeThread() EXCLUDES(lock_) {
-  bool HasBeenNotified() const REQUIRES(mutex_) {
+    MutexLock ml(lock_);
-    return notified_yet_;
+    --running_threads_;
+    if (entered_ != 0) phase_condition_.notify_all();
  }
-  mutable Mutex mutex_;
+ private:
-  mutable std::condition_variable cv_;
+  Mutex lock_;
-  bool notified_yet_ GUARDED_BY(mutex_);
+  Condition phase_condition_;
+  int running_threads_;
+  // State for barrier management
+  int phase_number_ = 0;
+  int entered_ = 0;  // Number of threads that have entered this barrier
+  // Enter the barrier and wait until all other threads have also
+  // entered the barrier.  Returns true iff this is the last thread to
+  // enter the barrier.
+  bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
+    CHECK_LT(entered_, running_threads_);
+    entered_++;
+    if (entered_ < running_threads_) {
+      // Wait for all threads to enter
+      int phase_number_cp = phase_number_;
+      auto cb = [this, phase_number_cp]() {
+        return this->phase_number_ > phase_number_cp ||
+               entered_ == running_threads_;  // A thread has aborted in error
+      };
+      phase_condition_.wait(ml.native_handle(), cb);
+      if (phase_number_ > phase_number_cp) return false;
+      // else (running_threads_ == entered_) and we are the last thread.
+    }
+    // Last thread has reached the barrier
+    phase_number_++;
+    entered_ = 0;
+    return true;
+  }
 };
 } // end namespace benchmark

--- a/src/reporter.cc
+++ b/src/reporter.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 #include "benchmark/reporter.h"
-#include "walltime.h"
+#include "timers.h"
 #include <cstdlib>

--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -52,7 +52,6 @@ namespace {
 std::once_flag cpuinfo_init;
 double cpuinfo_cycles_per_second = 1.0;
 int cpuinfo_num_cpus = 1;  // Conservative guess
-std::mutex cputimens_mutex;
 #if !defined BENCHMARK_OS_MACOSX
 const int64_t estimate_time_ms = 1000;
@@ -288,101 +287,8 @@ void InitializeSystemInfo() {
  cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
 #endif
 }
-}  // end namespace
-// getrusage() based implementation of MyCPUUsage
-static double MyCPUUsageRUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  struct rusage ru;
-  if (getrusage(RUSAGE_SELF, &ru) == 0) {
-    return (static_cast<double>(ru.ru_utime.tv_sec) +
-            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
-            static_cast<double>(ru.ru_stime.tv_sec) +
-            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
-  } else {
-    return 0.0;
-  }
-#else
-  HANDLE proc = GetCurrentProcess();
-  FILETIME creation_time;
-  FILETIME exit_time;
-  FILETIME kernel_time;
-  FILETIME user_time;
-  ULARGE_INTEGER kernel;
-  ULARGE_INTEGER user;
-  GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, &user_time);
-  kernel.HighPart = kernel_time.dwHighDateTime;
-  kernel.LowPart = kernel_time.dwLowDateTime;
-  user.HighPart = user_time.dwHighDateTime;
-  user.LowPart = user_time.dwLowDateTime;
-  return (static_cast<double>(kernel.QuadPart) +
-          static_cast<double>(user.QuadPart)) * 1e-7;
-#endif  // OS_WINDOWS
-}
-#ifndef BENCHMARK_OS_WINDOWS
-static bool MyCPUUsageCPUTimeNsLocked(double* cputime) {
-  static int cputime_fd = -1;
-  if (cputime_fd == -1) {
-    cputime_fd = open("/proc/self/cputime_ns", O_RDONLY);
-    if (cputime_fd < 0) {
-      cputime_fd = -1;
-      return false;
-    }
-  }
-  char buff[64];
-  memset(buff, 0, sizeof(buff));
-  if (pread(cputime_fd, buff, sizeof(buff) - 1, 0) <= 0) {
-    close(cputime_fd);
-    cputime_fd = -1;
-    return false;
-  }
-  unsigned long long result = strtoull(buff, nullptr, 0);
-  if (result == (std::numeric_limits<unsigned long long>::max)()) {
-    close(cputime_fd);
-    cputime_fd = -1;
-    return false;
-  }
-  *cputime = static_cast<double>(result) / 1e9;
-  return true;
-}
-#endif  // OS_WINDOWS
-double MyCPUUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  {
-    std::lock_guard<std::mutex> l(cputimens_mutex);
-    static bool use_cputime_ns = true;
-    if (use_cputime_ns) {
-      double value;
-      if (MyCPUUsageCPUTimeNsLocked(&value)) {
-        return value;
-      }
-      // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
-      VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
-      use_cputime_ns = false;
-    }
-  }
-#endif  // OS_WINDOWS
-  return MyCPUUsageRUsage();
-}
-double ChildrenCPUUsage() {
+}  // end namespace
-#ifndef BENCHMARK_OS_WINDOWS
-  struct rusage ru;
-  if (getrusage(RUSAGE_CHILDREN, &ru) == 0) {
-    return (static_cast<double>(ru.ru_utime.tv_sec) +
-            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
-            static_cast<double>(ru.ru_stime.tv_sec) +
-            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
-  } else {
-    return 0.0;
-  }
-#else
-  // TODO: Not sure what this even means on Windows
-  return 0.0;
-#endif  // OS_WINDOWS
-}
 double CyclesPerSecond(void) {
  std::call_once(cpuinfo_init, InitializeSystemInfo);

--- a/src/sysinfo.h
+++ b/src/sysinfo.h
@@ -2,8 +2,6 @@
 #define BENCHMARK_SYSINFO_H_
 namespace benchmark {
-double MyCPUUsage();
-double ChildrenCPUUsage();
 int NumCPUs();
 double CyclesPerSecond();
 bool CpuScalingEnabled();

--- a/src/timers.cc
+++ b/src/timers.cc
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "timers.h"
+#include "internal_macros.h"
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Shlwapi.h>
+#include <VersionHelpers.h>
+#include <Windows.h>
+#else
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#include <unistd.h>
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#include <sys/sysctl.h>
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#include <mach/thread_act.h>
+#endif
+#endif
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <limits>
+#include <mutex>
+#include "check.h"
+#include "log.h"
+#include "sleep.h"
+#include "string_util.h"
+namespace benchmark {
+// Suppress unused warnings on helper functions.
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+namespace {
+#if defined(BENCHMARK_OS_WINDOWS)
+double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) {
+  ULARGE_INTEGER kernel;
+  ULARGE_INTEGER user;
+  kernel.HighPart = kernel_time.dwHighDateTime;
+  kernel.LowPart = kernel_time.dwLowDateTime;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  return (static_cast<double>(kernel.QuadPart) +
+          static_cast<double>(user.QuadPart)) *
+         1e-7;
+}
+#else
+double MakeTime(struct timespec const& ts) {
+  return ts.tv_sec + (static_cast<double>(ts.tv_nsec) * 1e-9);
+}
+double MakeTime(struct rusage ru) {
+  return (static_cast<double>(ru.ru_utime.tv_sec) +
+          static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
+          static_cast<double>(ru.ru_stime.tv_sec) +
+          static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+}
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+// Converts Mach per-thread accounting data into seconds of CPU time by
+// summing the user and system components, each of which is stored as whole
+// seconds plus microseconds.
+double MakeTime(thread_basic_info_data_t const& info) {
+  return (static_cast<double>(info.user_time.seconds) +
+          static_cast<double>(info.user_time.microseconds) * 1e-6 +
+          static_cast<double>(info.system_time.seconds) +
+          // The system-time microseconds belong here; the user-time
+          // microseconds are already counted above.
+          static_cast<double>(info.system_time.microseconds) * 1e-6);
+}
+#endif
+BENCHMARK_NORETURN static void  DiagnoseAndExit(const char* msg) {
+    std::cerr << "ERROR: " << msg << std::endl;
+    std::exit(EXIT_FAILURE);
+}
+}  // end namespace
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+// Returns the CPU time consumed so far by the current process, in seconds.
+// The platform API is selected at compile time: GetProcessTimes() on Windows,
+// clock_gettime(CLOCK_PROCESS_CPUTIME_ID) where that clock id is defined, and
+// getrusage(RUSAGE_SELF) otherwise. If the selected call fails, a diagnostic
+// naming that call is printed and the process exits.
+double ProcessCPUUsage() {
+#if defined(BENCHMARK_OS_WINDOWS)
+  HANDLE proc = GetCurrentProcess();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  if (GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, &user_time))
+    return MakeTime(kernel_time, user_time);
+  DiagnoseAndExit("GetProcessTimes() failed");
+#elif defined(CLOCK_PROCESS_CPUTIME_ID)
+  struct timespec spec;
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+    return MakeTime(spec);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+#else
+  struct rusage ru;
+  if (getrusage(RUSAGE_SELF, &ru) == 0)
+    return MakeTime(ru);
+  // Report the call that actually failed in this branch.
+  DiagnoseAndExit("getrusage(RUSAGE_SELF, ...) failed");
+#endif
+}
+double ThreadCPUUsage() {
+#if defined(BENCHMARK_OS_WINDOWS)
+  HANDLE this_thread = GetCurrentThread();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time,
+                 &user_time);
+  return MakeTime(kernel_time, user_time);
+#elif defined(CLOCK_THREAD_CPUTIME_ID)
+  struct timespec ts;
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
+    return MakeTime(ts);
+  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
+#elif defined(BENCHMARK_OS_MACOSX)
+  mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
+  thread_basic_info_data_t info;
+  mach_port_t thread = pthread_mach_thread_np(pthread_self());
+  if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t) &info, &count)
+      == KERN_SUCCESS) {
+    return MakeTime(info);
+  }
+  DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
+#else
+#error Per-thread timing is not available on your system.
+#endif
+}
+namespace {
+std::string DateTimeString(bool local) {
+  typedef std::chrono::system_clock Clock;
+  std::time_t now = Clock::to_time_t(Clock::now());
+  const std::size_t kStorageSize = 128;
+  char storage[kStorageSize];
+  std::size_t written;
+  if (local) {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written =
+        std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
+#else
+    std::tm timeinfo;
+    std::memset(&timeinfo, 0, sizeof(std::tm));
+    ::localtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  } else {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
+#else
+    std::tm timeinfo;
+    std::memset(&timeinfo, 0, sizeof(std::tm));
+    ::gmtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  }
+  CHECK(written < kStorageSize);
+  ((void)written);  // prevent unused variable in optimized mode.
+  return std::string(storage);
+}
+}  // end namespace
+std::string LocalDateTimeString() { return DateTimeString(true); }
+}  // end namespace benchmark
--- a/src/timers.h
+++ b/src/timers.h
+#ifndef BENCHMARK_TIMERS_H
+#define BENCHMARK_TIMERS_H
+#include <chrono>
+#include <string>
+namespace benchmark {
+// Return the CPU usage of the current process
+double ProcessCPUUsage();
+// Return the CPU usage of the children of the current process
+double ChildrenCPUUsage();
+// Return the CPU usage of the current thread
+double ThreadCPUUsage();
+#if defined(HAVE_STEADY_CLOCK)
+template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
+struct ChooseSteadyClock {
+  typedef std::chrono::high_resolution_clock type;
+};
+template <>
+struct ChooseSteadyClock<false> {
+  typedef std::chrono::steady_clock type;
+};
+#endif
+struct ChooseClockType {
+#if defined(HAVE_STEADY_CLOCK)
+  typedef ChooseSteadyClock<>::type type;
+#else
+  typedef std::chrono::high_resolution_clock type;
+#endif
+};
+inline double ChronoClockNow() {
+  typedef ChooseClockType::type ClockType;
+  using FpSeconds = std::chrono::duration<double, std::chrono::seconds::period>;
+  return FpSeconds(ClockType::now().time_since_epoch()).count();
+}
+std::string LocalDateTimeString();
+}  // end namespace benchmark
+#endif  // BENCHMARK_TIMERS_H
--- a/src/walltime.cc
+++ b/src/walltime.cc
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "benchmark/macros.h"
-#include "internal_macros.h"
-#include "walltime.h"
-#if defined(BENCHMARK_OS_WINDOWS)
-#include <time.h>
-#include <winsock.h> // for timeval
-#else
-#include <sys/time.h>
-#endif
-#include <cstdio>
-#include <cstdint>
-#include <cstring>
-#include <ctime>
-#include <atomic>
-#include <chrono>
-#include <limits>
-#include "arraysize.h"
-#include "check.h"
-#include "cycleclock.h"
-#include "log.h"
-#include "sysinfo.h"
-namespace benchmark {
-namespace walltime {
-namespace {
-#if defined(HAVE_STEADY_CLOCK)
-template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
-struct ChooseSteadyClock {
-    typedef std::chrono::high_resolution_clock type;
-};
-template <>
-struct ChooseSteadyClock<false> {
-    typedef std::chrono::steady_clock type;
-};
-#endif
-struct ChooseClockType {
-#if defined(HAVE_STEADY_CLOCK)
-  typedef ChooseSteadyClock<>::type type;
-#else
-  typedef std::chrono::high_resolution_clock type;
-#endif
-};
-class WallTimeImp
-{
-public:
-  WallTime Now();
-  static WallTimeImp& GetWallTimeImp() {
-    static WallTimeImp* imp = new WallTimeImp();
-    return *imp;
-  }
-private:
-  WallTimeImp();
-  // Helper routines to load/store a float from an AtomicWord. Required because
-  // g++ < 4.7 doesn't support std::atomic<float> correctly. I cannot wait to
-  // get rid of this horror show.
-  void SetDrift(float f) {
-    int32_t w;
-    memcpy(&w, &f, sizeof(f));
-    std::atomic_store(&drift_adjust_, w);
-  }
-  float GetDrift() const {
-    float f;
-    int32_t w = std::atomic_load(&drift_adjust_);
-    memcpy(&f, &w, sizeof(f));
-    return f;
-  }
-  WallTime Slow() const {
-    struct timeval tv;
-#if defined(BENCHMARK_OS_WINDOWS)
-    FILETIME    file_time;
-    SYSTEMTIME  system_time;
-    ULARGE_INTEGER ularge;
-    const unsigned __int64 epoch = 116444736000000000LL;
-    GetSystemTime(&system_time);
-    SystemTimeToFileTime(&system_time, &file_time);
-    ularge.LowPart = file_time.dwLowDateTime;
-    ularge.HighPart = file_time.dwHighDateTime;
-    tv.tv_sec = (long)((ularge.QuadPart - epoch) / (10L * 1000 * 1000));
-    tv.tv_usec = (long)(system_time.wMilliseconds * 1000);
-#else
-    gettimeofday(&tv, nullptr);
-#endif
-    return tv.tv_sec + tv.tv_usec * 1e-6;
-  }
-private:
-  static_assert(sizeof(float) <= sizeof(int32_t),
-               "type sizes don't allow the drift_adjust hack");
-  WallTime base_walltime_;
-  int64_t base_cycletime_;
-  int64_t cycles_per_second_;
-  double seconds_per_cycle_;
-  uint32_t last_adjust_time_;
-  std::atomic<int32_t> drift_adjust_;
-  int64_t max_interval_cycles_;
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(WallTimeImp);
-};
-WallTime WallTimeImp::Now() {
-  WallTime now = 0.0;
-  WallTime result = 0.0;
-  int64_t ct = 0;
-  uint32_t top_bits = 0;
-  do {
-    ct = cycleclock::Now();
-    int64_t cycle_delta = ct - base_cycletime_;
-    result = base_walltime_ + cycle_delta * seconds_per_cycle_;
-    top_bits = static_cast<uint32_t>(uint64_t(ct) >> 32);
-    // Recompute drift no more often than every 2^32 cycles.
-    // I.e., @2GHz, ~ every two seconds
-    if (top_bits == last_adjust_time_) {  // don't need to recompute drift
-      return result + GetDrift();
-    }
-    now = Slow();
-  } while (cycleclock::Now() - ct > max_interval_cycles_);
-  // We are now sure that "now" and "result" were produced within
-  // kMaxErrorInterval of one another.
-  SetDrift(static_cast<float>(now - result));
-  last_adjust_time_ = top_bits;
-  return now;
-}
-WallTimeImp::WallTimeImp()
-    : base_walltime_(0.0), base_cycletime_(0),
-      cycles_per_second_(0), seconds_per_cycle_(0.0),
-      last_adjust_time_(0), drift_adjust_(0),
-      max_interval_cycles_(0) {
-  const double kMaxErrorInterval = 100e-6;
-  cycles_per_second_ = static_cast<int64_t>(CyclesPerSecond());
-  CHECK(cycles_per_second_ != 0);
-  seconds_per_cycle_ = 1.0 / cycles_per_second_;
-  max_interval_cycles_ =
-      static_cast<int64_t>(cycles_per_second_ * kMaxErrorInterval);
-  do {
-    base_cycletime_ = cycleclock::Now();
-    base_walltime_ = Slow();
-  } while (cycleclock::Now() - base_cycletime_ > max_interval_cycles_);
-  // We are now sure that "base_walltime" and "base_cycletime" were produced
-  // within kMaxErrorInterval of one another.
-  SetDrift(0.0);
-  last_adjust_time_ = static_cast<uint32_t>(uint64_t(base_cycletime_) >> 32);
-}
-WallTime CPUWalltimeNow() {
-  static WallTimeImp& imp = WallTimeImp::GetWallTimeImp();
-  return imp.Now();
-}
-WallTime ChronoWalltimeNow() {
-  typedef ChooseClockType::type Clock;
-  typedef std::chrono::duration<WallTime, std::chrono::seconds::period>
-          FPSeconds;
-  static_assert(std::chrono::treat_as_floating_point<WallTime>::value,
-                "This type must be treated as a floating point type.");
-  auto now = Clock::now().time_since_epoch();
-  return std::chrono::duration_cast<FPSeconds>(now).count();
-}
-bool UseCpuCycleClock() {
-    bool useWallTime = !CpuScalingEnabled();
-    if (useWallTime) {
-        VLOG(1) << "Using the CPU cycle clock to provide walltime::Now().\n";
-    } else {
-        VLOG(1) << "Using std::chrono to provide walltime::Now().\n";
-    }
-    return useWallTime;
-}
-} // end anonymous namespace
-// WallTimeImp doesn't work when CPU Scaling is enabled. If CPU Scaling is
-// enabled at the start of the program then std::chrono::system_clock is used
-// instead.
-WallTime Now()
-{
-  static bool useCPUClock = UseCpuCycleClock();
-  if (useCPUClock) {
-    return CPUWalltimeNow();
-  } else {
-    return ChronoWalltimeNow();
-  }
-}
-}  // end namespace walltime
-namespace {
-std::string DateTimeString(bool local) {
-  typedef std::chrono::system_clock Clock;
-  std::time_t now = Clock::to_time_t(Clock::now());
-  char storage[128];
-  std::size_t written;
-  if (local) {
-#if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
-#else
-    std::tm timeinfo;
-    std::memset(&timeinfo, 0, sizeof(std::tm));
-    ::localtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-#endif
-  } else {
-#if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
-#else
-    std::tm timeinfo;
-    std::memset(&timeinfo, 0, sizeof(std::tm));
-    ::gmtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-#endif
-  }
-  CHECK(written < arraysize(storage));
-  ((void)written); // prevent unused variable in optimized mode.
-  return std::string(storage);
-}
-} // end namespace
-std::string LocalDateTimeString() {
-  return DateTimeString(true);
-}
-}  // end namespace benchmark
--- a/src/walltime.h
+++ b/src/walltime.h
-#ifndef BENCHMARK_WALLTIME_H_
-#define BENCHMARK_WALLTIME_H_
-#include <string>
-namespace benchmark {
-typedef double WallTime;
-namespace walltime {
-WallTime Now();
-}  // end namespace walltime
-std::string LocalDateTimeString();
-}  // end namespace benchmark
-#endif  // BENCHMARK_WALLTIME_H_
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -59,6 +59,10 @@ def calculate_change(old_val, new_val):
    """
    Return a float representing the decimal change between old_val and new_val.
    """
+    if old_val == 0 and new_val == 0:
+        return 0.0
+    if old_val == 0:
+        return float(new_val - old_val) / (float(old_val + new_val) / 2)
    return float(new_val - old_val) / abs(old_val)
@@ -73,7 +77,7 @@ def generate_difference_report(json1, json2, use_color=True):
            if b['name'] == name:
                return b
        return None
-    first_line = "{:<{}s}     Time           CPU".format(
+    first_line = "{:<{}s}     Time           CPU           Old           New".format(
        'Benchmark', first_col_width)
    output_strs = [first_line, '-' * len(first_line)]
    for bn in json1['benchmarks']:
@@ -88,12 +92,13 @@ def generate_difference_report(json1, json2, use_color=True):
                return BC_WHITE
            else:
                return BC_CYAN
-        fmt_str = "{}{:<{}s}{endc}    {}{:+.2f}{endc}         {}{:+.2f}{endc}"
+        fmt_str = "{}{:<{}s}{endc}    {}{:+.2f}{endc}         {}{:+.2f}{endc}         {:4d}         {:4d}"
        tres = calculate_change(bn['real_time'], other_bench['real_time'])
        cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
        output_strs += [color_format(use_color, fmt_str,
            BC_HEADER, bn['name'], first_col_width,
            get_color(tres), tres, get_color(cpures), cpures,
+            bn['cpu_time'], other_bench['cpu_time'],
            endc=BC_ENDC)]
    return output_strs