Commit fe71eb9a by Ben Clayton

Squashed 'third_party/marl/' content from commit d3b8558ce

git-subtree-dir: third_party/marl git-subtree-split: d3b8558ce8d2cf2cad1009a99aa3ff453b048639
parents
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
BasedOnStyle: Chromium
\ No newline at end of file
/.vscode/
/build/
\ No newline at end of file
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
# This is the list of the Marl authors for copyright purposes.
#
# This does not necessarily list everyone who has contributed code, since in
# some cases, their employer may be the copyright holder. To see the full list
# of contributors, see the revision history in source control.
Google LLC
Shawn Anastasio <shawn@anastas.io>
A. Wilcox <awilfox@adelielinux.org>
# Copyright 2019 The Marl Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# list(FILTER) (used below to split sources from tests) requires CMake 3.6.
# The previously declared minimum of 2.8 would fail at configure time on the
# list(FILTER) calls.
cmake_minimum_required(VERSION 3.6)

set(CMAKE_CXX_STANDARD 11)

project(Marl C CXX ASM)

###########################################################
# Options
###########################################################
option(MARL_BUILD_EXAMPLES "Build example applications" OFF)
option(MARL_BUILD_TESTS "Build tests" ON)
option(MARL_ASAN "Build marl with address sanitizer" OFF)
option(MARL_MSAN "Build marl with memory sanitizer" OFF)
option(MARL_TSAN "Build marl with thread sanitizer" OFF)

# The sanitizers instrument code in mutually incompatible ways; at most one
# may be enabled for a build.
if(MARL_ASAN AND MARL_TSAN)
  message(FATAL_ERROR "MARL_ASAN and MARL_TSAN are mutually exclusive")
endif()
if(MARL_ASAN AND MARL_MSAN)
  message(FATAL_ERROR "MARL_ASAN and MARL_MSAN are mutually exclusive")
endif()
if(MARL_MSAN AND MARL_TSAN)
  message(FATAL_ERROR "MARL_MSAN and MARL_TSAN are mutually exclusive")
endif()
###########################################################
# Directories
###########################################################
set(MARL_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
set(MARL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
set(GOOGLETEST_DIR ${THIRD_PARTY_DIR}/googletest)

###########################################################
# Submodules
###########################################################
if(MARL_BUILD_TESTS)
  # Tests require the googletest submodule. Disable tests (rather than
  # failing the configure) when it has not been checked out.
  if(NOT EXISTS ${GOOGLETEST_DIR}/.git)
    message(WARNING "third_party/googletest submodule missing.")
    message(WARNING "Run: `git submodule update --init` to build tests.")
    set(MARL_BUILD_TESTS OFF)
  endif()
endif()
###########################################################
# File lists
###########################################################
# Note: globbing sources means newly added files are only picked up on
# re-configure; an explicit source list would be more robust.
file(GLOB MARL_FULL_LIST
    ${MARL_SRC_DIR}/*.cpp
    ${MARL_SRC_DIR}/*.h
    ${MARL_SRC_DIR}/*.c
)
if(NOT MSVC)
  # The fiber context-switch routines are assembly on non-MSVC toolchains.
  file(GLOB MARL_ASSEMBLY_LIST ${MARL_SRC_DIR}/*.S)
  list(APPEND MARL_FULL_LIST ${MARL_ASSEMBLY_LIST})
endif()

# Split the globbed files into library sources and unit-test sources.
set(MARL_LIST ${MARL_FULL_LIST})
set(MARL_TEST_LIST ${MARL_FULL_LIST})
list(FILTER MARL_LIST EXCLUDE REGEX ".*_test\\..*")
list(FILTER MARL_TEST_LIST INCLUDE REGEX ".*_test\\..*")
###########################################################
# OS libraries
###########################################################
# Extra system libraries each platform needs when linking against marl.
if(WIN32)
  set(MARL_OS_LIBS Kernel32)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
  set(MARL_OS_LIBS pthread)
elseif(APPLE)
  # macOS needs no additional libraries.
  set(MARL_OS_LIBS)
endif()
###########################################################
# Targets
###########################################################

# marl — the core scheduler static library.
add_library(marl STATIC ${MARL_LIST})
set_target_properties(marl PROPERTIES
    INCLUDE_DIRECTORIES "${MARL_INCLUDE_DIR}"
    POSITION_INDEPENDENT_CODE 1
)

# Sanitizer instrumentation must be applied to both the compile and link
# steps, and must propagate to consumers (PUBLIC).
# Fixes: MARL_TSAN was declared as an option but never applied, and the
# undeclared MARL_MSAN was tested in its place. All three are handled here.
if(MARL_ASAN)
  target_compile_options(marl PUBLIC "-fsanitize=address")
  target_link_libraries(marl PUBLIC "-fsanitize=address")
elseif(MARL_MSAN)
  target_compile_options(marl PUBLIC "-fsanitize=memory")
  target_link_libraries(marl PUBLIC "-fsanitize=memory")
elseif(MARL_TSAN)
  target_compile_options(marl PUBLIC "-fsanitize=thread")
  target_link_libraries(marl PUBLIC "-fsanitize=thread")
endif()

target_link_libraries(marl PUBLIC "${MARL_OS_LIBS}")
# tests
if(MARL_BUILD_TESTS)
  # Re-glob the test sources together with the bundled gtest-all.cc.
  # (This intentionally replaces the MARL_TEST_LIST computed above.)
  file(GLOB MARL_TEST_LIST
      ${MARL_SRC_DIR}/*_test.cpp
      ${GOOGLETEST_DIR}/googletest/src/gtest-all.cc
  )
  set(MARL_TEST_INCLUDE_DIR
      ${GOOGLETEST_DIR}/googletest/include/
      ${GOOGLETEST_DIR}/googlemock/include/
      ${GOOGLETEST_DIR}/googletest/
      ${MARL_INCLUDE_DIR}
  )
  add_executable(marl-unittests ${MARL_TEST_LIST})
  set_target_properties(marl-unittests PROPERTIES
      INCLUDE_DIRECTORIES "${MARL_TEST_INCLUDE_DIR}"
      FOLDER "Tests"
  )
  target_link_libraries(marl-unittests PRIVATE marl "${MARL_OS_LIBS}")
endif()
# examples
if(MARL_BUILD_EXAMPLES)
  # build_example(name): builds examples/<name>.cpp into an executable target
  # called <name>, linked against marl.
  function(build_example name)
    add_executable(${name} "${CMAKE_CURRENT_SOURCE_DIR}/examples/${name}.cpp")
    set_target_properties(${name} PROPERTIES
        INCLUDE_DIRECTORIES "${MARL_INCLUDE_DIR}"
        FOLDER "Examples"
    )
    target_link_libraries(${name} PRIVATE marl "${MARL_OS_LIBS}")
  endfunction()

  build_example(fractal)
endif()
# How to Contribute
We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Code reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
## Community Guidelines
This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
\ No newline at end of file
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
\ No newline at end of file
# Marl
Marl is a hybrid thread / fiber task scheduler written in C++ 11.
## About
Marl is a C++ 11 library that provides a fluent interface for running tasks across a number of threads.
Marl uses a combination of fibers and threads to allow efficient execution of tasks that can block, while keeping a fixed number of hardware threads.
Marl supports Windows, macOS, Linux, Fuchsia and Android (arm, aarch64, ppc64 (ELFv2), x86 and x64).
Marl has no dependencies on other libraries (with the exception of googletest for building the optional unit tests).
Marl is in early development and will have breaking API changes.
**More documentation and examples coming soon.**
Note: This is not an officially supported Google product
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is an example application that uses Marl to parallelize the calculation of
// a Julia fractal.
#include <marl/defer.h>
#include <marl/scheduler.h>
#include <marl/thread.h>
#include <marl/waitgroup.h>
#include <fstream>
#include <math.h>
#include <stdint.h>
// A color formed from a red, green and blue component.
template <typename T>
struct Color {
  T r, g, b;

  // Component-wise addition.
  inline Color<T>& operator+=(const Color<T>& rhs) {
    r += rhs.r;
    g += rhs.g;
    b += rhs.b;
    return *this;
  }

  // Component-wise division by a scalar.
  inline Color<T>& operator/=(T rhs) {
    r /= rhs;
    g /= rhs;
    b /= rhs;
    return *this;
  }
};

// colorize returns a 'rainbow-color' for the scalar v.
inline Color<float> colorize(float v) {
  constexpr float PI = 3.141592653589793f;
  constexpr float PI_2_THIRDS = 2.0f * PI / 3.0f;
  // Three cosine waves phase-shifted by 120 degrees, each remapped from
  // [-1, 1] to [0, 1].
  return Color<float>{
      0.5f + 0.5f * cosf(v + 0 * PI_2_THIRDS),
      0.5f + 0.5f * cosf(v + 1 * PI_2_THIRDS),
      0.5f + 0.5f * cosf(v + 2 * PI_2_THIRDS),
  };
}

// lerp returns the linear interpolation between min and max using the weight x.
inline float lerp(float x, float min, float max) {
  return min + x * (max - min);
}

// julia calculates the Julia-set fractal value for the given coordinate and
// constant. See https://en.wikipedia.org/wiki/Julia_set for more information.
// Points that do not escape within the iteration limit return the zero
// (black) color.
Color<float> julia(float x, float y, float cx, float cy) {
  // Fix: the original declared an `iteration` local that was never used.
  constexpr int maxIterations = 1000;
  for (int i = 0; i < maxIterations; i++) {
    // |z| > 2 guarantees divergence; color by the escape iteration count.
    if (x * x + y * y > 4) {
      return colorize(sqrt(i));
    }
    // z = z^2 + c, expanded into real and imaginary parts.
    auto xtemp = x * x - y * y;
    y = 2 * x * y + cy;
    x = xtemp + cx;
  }
  return {};
}
// writeBMP writes the given image as a 24-bit bitmap to the given file,
// returning true on success and false on error.
//
// texels must hold width * height pixels in row-major order; BMP stores rows
// bottom-up, so rows are emitted in reverse.
bool writeBMP(const Color<uint8_t>* texels,
              int width,
              int height,
              const char* path) {
  auto file = fopen(path, "wb");
  if (!file) {
    fprintf(stderr, "Could not open file '%s'\n", path);
    return false;
  }
  defer(fclose(file));

  bool ok = true;
  // Little-endian field writers; any short write latches ok to false.
  auto put4 = [&](uint32_t val) { ok = ok && fwrite(&val, 1, 4, file) == 4; };
  auto put2 = [&](uint16_t val) { ok = ok && fwrite(&val, 1, 2, file) == 2; };
  auto put1 = [&](uint8_t val) { ok = ok && fwrite(&val, 1, 1, file) == 1; };

  // Every BMP pixel row is padded to a 4-byte boundary.
  const uint32_t padding = -(3 * width) & 3U;  // in bytes
  const uint32_t stride = 3 * width + padding; // in bytes
  const uint32_t offset = 54;                  // headers precede pixel data
  // Total file size: headers plus padded pixel data. Fix: the size field was
  // previously written as offset + stride * height * 3, triple-counting the
  // bytes-per-pixel that `stride` already includes (and the `size` constant
  // computed for it went unused).
  const uint32_t size = offset + stride * height;

  // Bitmap file header
  put1('B');  // header field
  put1('M');
  put4(size);    // total file size in bytes
  put4(0);       // reserved
  put4(offset);  // byte offset of the pixel data

  // BITMAPINFOHEADER
  put4(40);      // size of header in bytes
  put4(width);   // width in pixels
  put4(height);  // height in pixels
  put2(1);       // number of color planes
  put2(24);      // bits per pixel
  put4(0);       // compression scheme (none)
  put4(0);       // image size (0 is valid for uncompressed images)
  put4(72);      // horizontal resolution (spec says pixels/meter; nominal)
  put4(72);      // vertical resolution
  put4(0);       // color palette size
  put4(0);       // 'important colors' count

  // Pixel data: bottom-up rows, BGR byte order.
  for (int y = height - 1; y >= 0; y--) {
    for (int x = 0; x < width; x++) {
      auto& texel = texels[x + y * width];
      put1(texel.b);
      put1(texel.g);
      put1(texel.r);
    }
    // Fix: i is uint32_t to match padding (was a signed/unsigned compare).
    for (uint32_t i = 0; i < padding; i++) {
      put1(0);
    }
  }
  return ok;
}
// Constants used for rendering the fractal.
constexpr uint32_t imageWidth = 2048;   // output image width in pixels.
constexpr uint32_t imageHeight = 2048;  // output image height in pixels.
constexpr int samplesPerPixel = 8;      // jittered samples for anti-aliasing.
// The region of the complex plane mapped onto the image.
constexpr float windowMinX = -0.5f;
constexpr float windowMaxX = +0.5f;
constexpr float windowMinY = -0.5f;
constexpr float windowMaxY = +0.5f;
// The Julia-set constant c = cx + cy*i.
constexpr float cx = -0.8f;
constexpr float cy = 0.156f;
// Renders the Julia fractal in parallel (one task per image row) and writes
// the result to "fractal.bmp". Returns 0 on success, 1 if the write fails.
int main(int argc, const char** argv) {
  // Create a marl scheduler using the full number of logical cpus.
  // Bind this scheduler to the main thread so we can call marl::schedule()
  marl::Scheduler scheduler;
  scheduler.setWorkerThreadCount(marl::Thread::numLogicalCPUs());
  scheduler.bind();
  defer(scheduler.unbind());  // unbind before destructing the scheduler.

  // Allocate the image.
  auto pixels = new Color<uint8_t>[imageWidth * imageHeight];
  defer(delete[] pixels);  // free memory before returning.

  // Create a wait group that will be used to synchronize the tasks.
  // The wait group is constructed with an initial count of imageHeight as
  // there will be a total of imageHeight tasks.
  marl::WaitGroup wg(imageHeight);

  // For each line of the image...
  // NOTE(review): y is an int compared against the uint32_t imageHeight —
  // works for these values but triggers sign-compare warnings; consider
  // uint32_t loop variables.
  for (int y = 0; y < imageHeight; y++) {
    // Schedule a task to calculate the image for this line.
    // These may run concurrently across hardware threads.
    // Capture is by value so each task gets its own copy of y.
    marl::schedule([=] {
      // Before this task returns, decrement the wait group counter.
      // This is used to indicate that the task is done.
      defer(wg.done());
      for (int x = 0; x < imageWidth; x++) {
        // Calculate the fractal pixel color.
        Color<float> color = {};
        // Average several jittered samples per pixel for anti-aliasing.
        // NOTE(review): rand() may not be thread-safe on all platforms;
        // jitter quality here is best-effort.
        for (int sample = 0; sample < samplesPerPixel; sample++) {
          auto fx = float(x) + (rand() / float(RAND_MAX));
          auto fy = float(y) + (rand() / float(RAND_MAX));
          auto dx = float(fx) / float(imageWidth);
          auto dy = float(fy) / float(imageHeight);
          color += julia(lerp(dx, windowMinX, windowMaxX),
                         lerp(dy, windowMinY, windowMaxY), cx, cy);
        }
        color /= samplesPerPixel;
        // Convert the averaged color to 8-bit and store it in the image.
        pixels[x + y * imageWidth] = {static_cast<uint8_t>(color.r * 255),
                                      static_cast<uint8_t>(color.g * 255),
                                      static_cast<uint8_t>(color.b * 255)};
      }
    });
  }

  // Wait until all image lines have been calculated.
  wg.wait();

  // Write the image to "fractal.bmp".
  if (!writeBMP(pixels, imageWidth, imageHeight, "fractal.bmp")) {
    return 1;
  }

  // All done.
  return 0;
}
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "defer.h"
#include "waitgroup.h"
#include <thread>
#include <type_traits>
namespace marl {
namespace detail {
template <typename RETURN_TYPE>
class OnNewThread {
public:
template <typename F, typename... Args>
inline static RETURN_TYPE call(F&& f, Args&&... args) {
RETURN_TYPE result;
WaitGroup wg(1);
auto thread = std::thread([&] {
defer(wg.done());
result = f(args...);
});
wg.wait();
thread.join();
return result;
}
};
template <>
class OnNewThread<void> {
public:
template <typename F, typename... Args>
inline static void call(F&& f, Args&&... args) {
WaitGroup wg(1);
auto thread = std::thread([&] {
defer(wg.done());
f(args...);
});
wg.wait();
thread.join();
}
};
} // namespace detail
// blocking_call() calls the function F on a new thread, yielding this fiber
// to execute other tasks until F has returned.
//
// Example:
//
//   void runABlockingFunctionOnATask()
//   {
//     // Schedule a task that calls a blocking, non-yielding function.
//     marl::schedule([=] {
//       // call_blocking_function() may block indefinitely.
//       // Ensure this call does not block other tasks from running.
//       auto result = marl::blocking_call(call_blocking_function);
//       // call_blocking_function() has now returned.
//       // result holds the return value of the blocking function call.
//     });
//   }
template <typename F, typename... Args>
auto inline blocking_call(F&& f, Args&&... args) -> decltype(f(args...)) {
  return detail::OnNewThread<decltype(f(args...))>::call(
      std::forward<F>(f), std::forward<Args>(args)...);
}
} // namespace marl
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_condition_variable_h
#define marl_condition_variable_h
#include "containers.h"
#include "debug.h"
#include "scheduler.h"
#include <atomic>
#include <condition_variable>
#include <mutex>
namespace marl {
// ConditionVariable is a synchronization primitive that can be used to block
// one or more fibers or threads, until another fiber or thread modifies a
// shared variable (the condition) and notifies the ConditionVariable.
//
// If the ConditionVariable is blocked on a thread with a Scheduler bound, the
// thread will work on other tasks until the ConditionVariable is unblocked.
class ConditionVariable {
 public:
  // Notifies and potentially unblocks one waiting fiber or thread.
  inline void notify_one();

  // Notifies and potentially unblocks all waiting fibers and/or threads.
  inline void notify_all();

  // Blocks the current fiber or thread until the predicate is satisfied
  // and the ConditionVariable is notified.
  template <typename Predicate>
  inline void wait(std::unique_lock<std::mutex>& lock, Predicate pred);

 private:
  std::mutex mutex;  // guards `waiting`.
  // Fibers currently blocked in wait(); notify_one()/notify_all()
  // reschedule them.
  containers::vector<Scheduler::Fiber*, 4> waiting;
  // Used by waiters that are not running on a scheduler fiber.
  std::condition_variable condition;
  // Total number of waiters (fibers + threads). Read without the lock in
  // notify_* as a fast-path early-out.
  std::atomic<int> numWaiting = {0};
  // Number of waiters blocked on `condition` (non-fiber waiters only).
  std::atomic<int> numWaitingOnCondition = {0};
};
// Wakes at most one fiber waiter (most recently queued) and, if any non-fiber
// waiters exist, also notifies the underlying std::condition_variable.
void ConditionVariable::notify_one() {
  if (numWaiting == 0) {
    return;  // fast path: nothing is waiting.
  }
  {
    std::lock_guard<std::mutex> guard(mutex);
    if (waiting.size() > 0) {
      auto* fiber = waiting.back();
      waiting.pop_back();
      fiber->schedule();
    }
  }
  // Signal outside the lock, matching the original unlock-then-notify order.
  if (numWaitingOnCondition > 0) {
    condition.notify_one();
  }
}
// Wakes every fiber waiter and, if any non-fiber waiters exist, notifies all
// waiters of the underlying std::condition_variable.
void ConditionVariable::notify_all() {
  if (numWaiting == 0) {
    return;  // fast path: nothing is waiting.
  }
  {
    std::lock_guard<std::mutex> guard(mutex);
    while (waiting.size() > 0) {
      auto* fiber = waiting.back();
      waiting.pop_back();
      fiber->schedule();
    }
  }
  // Signal outside the lock, matching the original unlock-then-notify order.
  if (numWaitingOnCondition > 0) {
    condition.notify_all();
  }
}
// Blocks until pred() is satisfied (and the ConditionVariable is notified).
// `dataLock` must be held on entry; it is released while suspended and
// re-acquired before each evaluation of pred and before returning.
template <typename Predicate>
void ConditionVariable::wait(std::unique_lock<std::mutex>& dataLock,
                             Predicate pred) {
  if (pred()) {
    return;  // already satisfied — no need to wait.
  }
  numWaiting++;
  if (auto fiber = Scheduler::Fiber::current()) {
    // Currently executing on a scheduler fiber.
    // Yield to let other tasks run that can unblock this fiber.
    while (!pred()) {
      // Register this fiber so notify_one()/notify_all() can reschedule it.
      mutex.lock();
      waiting.push_back(fiber);
      mutex.unlock();
      // Release the caller's lock before suspending so other threads can
      // change the condition and notify.
      dataLock.unlock();
      fiber->yield();
      dataLock.lock();  // re-acquire before re-testing the predicate.
    }
  } else {
    // Currently running outside of the scheduler.
    // Delegate to the std::condition_variable.
    numWaitingOnCondition++;
    condition.wait(dataLock, pred);
    numWaitingOnCondition--;
  }
  numWaiting--;
}
} // namespace marl
#endif // marl_condition_variable_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_containers_h
#define marl_containers_h
#include "debug.h"
#include <algorithm> // std::max
#include <type_traits> // std::aligned_storage
#include <utility> // std::move
#include <cstddef> // size_t
namespace marl {
namespace containers {
////////////////////////////////////////////////////////////////////////////////
// vector<T, BASE_CAPACITY>
////////////////////////////////////////////////////////////////////////////////
// vector is a container of contiguously stored elements.
// Unlike std::vector, marl::containers::vector keeps the first BASE_CAPACITY
// elements internally, which will avoid dynamic heap allocations.
// Once the vector exceeds BASE_CAPACITY elements, vector will allocate storage
// from the heap.
template <typename T, int BASE_CAPACITY>
class vector {
 public:
  inline vector() = default;

  // Copy / move construction from a vector of any inline capacity.
  template <int BASE_CAPACITY_2>
  inline vector(const vector<T, BASE_CAPACITY_2>& other);

  template <int BASE_CAPACITY_2>
  inline vector(vector<T, BASE_CAPACITY_2>&& other);

  inline ~vector();

  // Copy / move assignment from a vector of any inline capacity.
  template <int BASE_CAPACITY_2>
  inline vector<T, BASE_CAPACITY>& operator=(const vector<T, BASE_CAPACITY_2>&);

  template <int BASE_CAPACITY_2>
  inline vector<T, BASE_CAPACITY>& operator=(vector<T, BASE_CAPACITY_2>&&);

  inline void push_back(const T& el);
  inline void emplace_back(T&& el);
  inline void pop_back();
  inline T& front();
  inline T& back();
  inline T* begin();
  inline T* end();
  inline T& operator[](size_t i);
  inline const T& operator[](size_t i) const;
  inline size_t size() const;
  inline size_t cap() const;
  inline void resize(size_t n);
  inline void reserve(size_t n);

 private:
  // Raw, correctly-aligned storage for one T; elements are created in it
  // with placement-new and destroyed explicitly.
  using TStorage = typename std::aligned_storage<sizeof(T), alignof(T)>::type;

  // Destroys elements and releases heap storage (defined later in this
  // file, beyond this excerpt).
  inline void free();

  size_t count = 0;                 // number of constructed elements.
  size_t capacity = BASE_CAPACITY;  // total available storage slots.
  TStorage buffer[BASE_CAPACITY];   // inline storage used until exceeded.
  TStorage* elements = buffer;      // points at buffer or a heap block.
};
// Copy-constructs from a vector with a possibly different inline capacity.
template <typename T, int BASE_CAPACITY>
template <int BASE_CAPACITY_2>
vector<T, BASE_CAPACITY>::vector(const vector<T, BASE_CAPACITY_2>& other) {
  *this = other;  // delegate to the copy-assignment operator.
}
// Move-constructs from a vector with a possibly different inline capacity.
template <typename T, int BASE_CAPACITY>
template <int BASE_CAPACITY_2>
vector<T, BASE_CAPACITY>::vector(vector<T, BASE_CAPACITY_2>&& other) {
  *this = std::move(other);  // delegate to the move-assignment operator.
}
// Destroys all elements and releases any heap allocation via free().
template <typename T, int BASE_CAPACITY>
vector<T, BASE_CAPACITY>::~vector() {
  free();
}
// Replaces this vector's contents with copies of other's elements.
template <typename T, int BASE_CAPACITY>
template <int BASE_CAPACITY_2>
vector<T, BASE_CAPACITY>& vector<T, BASE_CAPACITY>::operator=(
    const vector<T, BASE_CAPACITY_2>& other) {
  free();  // destroy any existing elements first.
  reserve(other.size());
  count = other.size();
  auto* dst = reinterpret_cast<T*>(elements);
  for (size_t idx = 0; idx < count; idx++) {
    new (&dst[idx]) T(other[idx]);  // placement-copy into raw storage.
  }
  return *this;
}
// Moves other's elements into this vector, leaving other empty.
template <typename T, int BASE_CAPACITY>
template <int BASE_CAPACITY_2>
vector<T, BASE_CAPACITY>& vector<T, BASE_CAPACITY>::operator=(
    vector<T, BASE_CAPACITY_2>&& other) {
  free();  // destroy any existing elements first.
  reserve(other.size());
  count = other.size();
  auto* dst = reinterpret_cast<T*>(elements);
  for (size_t idx = 0; idx < count; idx++) {
    new (&dst[idx]) T(std::move(other[idx]));  // placement-move into storage.
  }
  other.resize(0);  // destroy the moved-from elements in the source.
  return *this;
}
// Appends a copy of el, growing storage if required.
template <typename T, int BASE_CAPACITY>
void vector<T, BASE_CAPACITY>::push_back(const T& el) {
  reserve(count + 1);
  auto* slot = reinterpret_cast<T*>(elements) + count;
  new (slot) T(el);  // copy-construct into the raw storage slot.
  ++count;
}
// Appends el by move, growing storage if required.
template <typename T, int BASE_CAPACITY>
void vector<T, BASE_CAPACITY>::emplace_back(T&& el) {
  reserve(count + 1);
  auto* slot = reinterpret_cast<T*>(elements) + count;
  new (slot) T(std::move(el));  // move-construct into the raw storage slot.
  ++count;
}
// pop_back() destructs and removes the last element.
// Asserts (in debug builds) that the vector is not empty.
template <typename T, int BASE_CAPACITY>
void vector<T, BASE_CAPACITY>::pop_back() {
  MARL_ASSERT(count > 0, "pop_back() called on empty vector");
  count--;
  reinterpret_cast<T*>(elements)[count].~T();
}
// front() returns a reference to the first element.
// Asserts (in debug builds) that the vector is not empty.
template <typename T, int BASE_CAPACITY>
T& vector<T, BASE_CAPACITY>::front() {
  MARL_ASSERT(count > 0, "front() called on empty vector");
  return reinterpret_cast<T*>(elements)[0];
}
// back() returns a reference to the last element.
// Asserts (in debug builds) that the vector is not empty.
template <typename T, int BASE_CAPACITY>
T& vector<T, BASE_CAPACITY>::back() {
  MARL_ASSERT(count > 0, "back() called on empty vector");
  return reinterpret_cast<T*>(elements)[count - 1];
}
// begin() returns a pointer to the first element (valid as an iterator even
// when the vector is empty, where begin() == end()).
template <typename T, int BASE_CAPACITY>
T* vector<T, BASE_CAPACITY>::begin() {
  return reinterpret_cast<T*>(elements);
}
// end() returns a pointer one-past the last element.
template <typename T, int BASE_CAPACITY>
T* vector<T, BASE_CAPACITY>::end() {
  return reinterpret_cast<T*>(elements) + count;
}
// operator[] returns a reference to the i'th element.
// Asserts (in debug builds) that i is within bounds.
template <typename T, int BASE_CAPACITY>
T& vector<T, BASE_CAPACITY>::operator[](size_t i) {
  MARL_ASSERT(i < count, "index %d exceeds vector size %d", int(i), int(count));
  return reinterpret_cast<T*>(elements)[i];
}
// operator[] (const) returns a const reference to the i'th element.
// Asserts (in debug builds) that i is within bounds.
template <typename T, int BASE_CAPACITY>
const T& vector<T, BASE_CAPACITY>::operator[](size_t i) const {
  MARL_ASSERT(i < count, "index %d exceeds vector size %d", int(i), int(count));
  return reinterpret_cast<T*>(elements)[i];
}
// size() returns the number of elements currently held by the vector.
template <typename T, int BASE_CAPACITY>
size_t vector<T, BASE_CAPACITY>::size() const {
  return count;
}
// cap() returns the vector's current storage capacity (in elements).
// Note: cap() is declared in the class above but its definition was missing
// from this header, producing a link error for any caller; added here.
template <typename T, int BASE_CAPACITY>
size_t vector<T, BASE_CAPACITY>::cap() const {
  return capacity;
}
// resize() adjusts the element count to exactly n.
// When growing, new trailing elements are default-constructed; when
// shrinking, trailing elements are destructed.
template <typename T, int BASE_CAPACITY>
void vector<T, BASE_CAPACITY>::resize(size_t n) {
  reserve(n);
  auto items = reinterpret_cast<T*>(elements);
  // Grow: default-construct elements up to the new size.
  for (; count < n; count++) {
    new (&items[count]) T();
  }
  // Shrink: destruct elements beyond the new size.
  while (count > n) {
    items[--count].~T();
  }
}
// reserve() ensures the vector can hold at least n elements without further
// reallocation. Growth doubles the requested size (minimum 8) to amortize
// repeated push_back() calls.
template <typename T, int BASE_CAPACITY>
void vector<T, BASE_CAPACITY>::reserve(size_t n) {
  if (n > capacity) {
    capacity = std::max<size_t>(n * 2, 8);
    auto grown = new TStorage[capacity];
    auto src = reinterpret_cast<T*>(elements);
    auto dst = reinterpret_cast<T*>(grown);
    for (size_t i = 0; i < count; i++) {
      // Move each element into the new buffer, then destruct the moved-from
      // original immediately.
      new (&dst[i]) T(std::move(src[i]));
      src[i].~T();
    }
    // Release the old buffer directly rather than via free(): free() also
    // resets the vector's pointer/capacity bookkeeping, which must not be
    // clobbered between the allocation above and the assignment below.
    if (elements != buffer) {
      delete[] elements;
    }
    elements = grown;
  }
}
// free() destructs all elements and releases any heap allocation, returning
// the vector's storage to its initial inline-buffer state.
// Note: count is intentionally left unchanged; callers (operator=, dtor)
// reset it themselves.
template <typename T, int BASE_CAPACITY>
void vector<T, BASE_CAPACITY>::free() {
  for (size_t i = 0; i < count; i++) {
    reinterpret_cast<T*>(elements)[i].~T();
  }
  if (elements != buffer) {
    delete[] elements;
    // Fall back to the inline buffer. The previous code set elements to
    // nullptr while leaving the (large) heap capacity in place, so a later
    // reserve() could skip reallocation and placement-new into a null
    // pointer — e.g. when a heap-backed vector was assigned from a smaller
    // vector.
    elements = buffer;
    capacity = BASE_CAPACITY;
  }
}
} // namespace containers
} // namespace marl
#endif // marl_containers_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_debug_h
#define marl_debug_h
#if !defined(MARL_DEBUG_ENABLED)
#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)
#define MARL_DEBUG_ENABLED 1
#else
#define MARL_DEBUG_ENABLED 0
#endif
#endif
namespace marl {
// fatal() reports an unrecoverable error using a printf-style format string.
// NOTE(review): definition lives in the implementation file; presumably
// prints and terminates the process — confirm there.
void fatal(const char* msg, ...);
// assert_has_bound_scheduler() verifies that a scheduler is bound to the
// current thread, naming the feature that requires one.
// NOTE(review): definition lives in the implementation file.
void assert_has_bound_scheduler(const char* feature);
#if MARL_DEBUG_ENABLED
#define MARL_FATAL(msg, ...) marl::fatal(msg "\n", ##__VA_ARGS__);
#define MARL_ASSERT(cond, msg, ...) \
do { \
if (!(cond)) { \
MARL_FATAL("ASSERT: " msg, ##__VA_ARGS__); \
} \
} while (false);
#define MARL_ASSERT_HAS_BOUND_SCHEDULER(feature) \
assert_has_bound_scheduler(feature);
#define MARL_UNREACHABLE() MARL_FATAL("UNREACHABLE");
#else
#define MARL_FATAL(msg, ...)
#define MARL_ASSERT(cond, msg, ...)
#define MARL_ASSERT_HAS_BOUND_SCHEDULER(feature)
#define MARL_UNREACHABLE()
#endif
} // namespace marl
#endif // marl_debug_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_defer_h
#define marl_defer_h
#include "finally.h"
namespace marl {
// MARL_CONCAT() token-pastes a and b.
// The two-level indirection is required so that macro arguments (such as
// __LINE__ in defer() below) are expanded before pasting.
#define MARL_CONCAT_(a, b) a##b
#define MARL_CONCAT(a, b) MARL_CONCAT_(a, b)
// defer() is a macro to defer execution of a statement until the surrounding
// scope is closed and is typically used to perform cleanup logic once a
// function returns.
//
// Note: Unlike golang's defer(), the defer statement is executed when the
// surrounding *scope* is closed, not necessarily the function.
//
// Example usage:
//
// void sayHelloWorld()
// {
// defer(printf("world\n"));
// printf("hello ");
// }
//
#define defer(x) \
auto MARL_CONCAT(defer_, __LINE__) = marl::make_finally([&] { x; })
} // namespace marl
#endif // marl_defer_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Finally can be used to execute a lambda or function when the final reference
// to the Finally is dropped.
//
// The purpose of a finally is to perform cleanup or termination logic and is
// especially useful when there are multiple early returns within a function.
//
// A moveable Finally can be constructed with marl::make_finally().
// A sharable Finally can be constructed with marl::make_shared_finally().
#ifndef marl_finally_h
#define marl_finally_h
#include <functional>
#include <memory>
namespace marl {
// Finally is a pure virtual base class, implemented by the templated
// FinallyImpl.
// Type-erased base: destroying a Finally through this interface runs the
// wrapped function via the FinallyImpl<F> destructor.
class Finally {
 public:
  virtual ~Finally() = default;
};
// FinallyImpl implements a Finally.
// The template parameter F is the function type to be called when the finally
// is destructed. F must have the signature void().
template <typename F>
class FinallyImpl : public Finally {
 public:
  inline FinallyImpl(const F& func);
  inline FinallyImpl(F&& func);
  inline FinallyImpl(FinallyImpl<F>&& other);
  inline ~FinallyImpl();

 private:
  // Copying and assignment are forbidden: exactly one live FinallyImpl may
  // own the pending call (move-construction transfers ownership).
  FinallyImpl(const FinallyImpl<F>& other) = delete;
  FinallyImpl<F>& operator=(const FinallyImpl<F>& other) = delete;
  FinallyImpl<F>& operator=(FinallyImpl<F>&&) = delete;

  // The function to invoke on destruction.
  F func;
  // Cleared when this instance is moved-from so func runs exactly once.
  bool valid = true;
};
// Constructs a FinallyImpl holding a copy of func.
template <typename F>
FinallyImpl<F>::FinallyImpl(const F& func) : func(func) {}
// Constructs a FinallyImpl taking ownership of func by move.
template <typename F>
FinallyImpl<F>::FinallyImpl(F&& func) : func(std::move(func)) {}
// Move constructor: transfers the pending call. The moved-from instance is
// marked invalid so it will not invoke func when destructed.
template <typename F>
FinallyImpl<F>::FinallyImpl(FinallyImpl<F>&& other)
    : func(std::move(other.func)) {
  other.valid = false;
}
// Destructor: invokes func unless this instance was moved-from.
template <typename F>
FinallyImpl<F>::~FinallyImpl() {
  if (valid) {
    func();
  }
}
// make_finally() returns a movable FinallyImpl<F> that invokes f when the
// final instance is destructed.
template <typename F>
inline FinallyImpl<F> make_finally(F&& f) {
  // f is a forwarding reference, so use std::forward rather than std::move:
  // unconditionally moving made lvalue arguments ill-formed (F deduces as
  // an lvalue-reference type, whose constructor cannot bind an rvalue).
  return FinallyImpl<F>(std::forward<F>(f));
}
// make_shared_finally() returns a shareable Finally that invokes f when the
// last shared_ptr reference is dropped.
template <typename F>
inline std::shared_ptr<Finally> make_shared_finally(F&& f) {
  // f is a forwarding reference, so use std::forward rather than std::move
  // (see make_finally above): moving an lvalue argument was ill-formed.
  return std::make_shared<FinallyImpl<F>>(std::forward<F>(f));
}
} // namespace marl
#endif // marl_finally_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_pool_h
#define marl_pool_h
#include "conditionvariable.h"
#include <atomic>
#include <mutex>
namespace marl {
// PoolPolicy controls whether pool items are constructed and destructed each
// time they are borrowed from and returned to a pool, or whether they persist
// constructed for the lifetime of the pool.
enum class PoolPolicy {
// Call the Pool items constructor on borrow(), and destruct the item
// when the item is returned.
Reconstruct,
// Construct and destruct all items once for the lifetime of the Pool.
// Items will keep their state between loans.
Preserve,
};
////////////////////////////////////////////////////////////////////////////////
// Pool<T>
////////////////////////////////////////////////////////////////////////////////
// Pool is the abstract base class for BoundedPool<> and UnboundedPool<>.
template <typename T>
class Pool {
 protected:
  struct Item;
  class Storage;

 public:
  // A Loan is returned by the pool's borrow() function.
  // Loans track the number of references to the loaned item, and return the
  // item to the pool when the final Loan reference is dropped.
  class Loan {
   public:
    inline Loan() = default;
    inline Loan(Item*, const std::shared_ptr<Storage>&);
    inline Loan(const Loan&);
    inline Loan(Loan&&);
    inline ~Loan();
    inline Loan& operator=(const Loan&);
    inline Loan& operator=(Loan&&);
    inline T& operator*();
    inline T* operator->() const;
    inline T* get() const;
    // reset() drops this loan's reference (returning the item to the pool if
    // it was the last reference) and leaves the loan empty.
    void reset();

   private:
    // The borrowed item, or nullptr for an empty loan.
    Item* item = nullptr;
    // Shared ownership of the pool storage keeps the storage alive while
    // loans are outstanding.
    std::shared_ptr<Storage> storage;
  };

 protected:
  Pool() = default;
  // The shared storage between the pool and all loans.
  class Storage {
   public:
    virtual ~Storage() = default;
    // return_() is called when the final Loan reference to item is dropped.
    virtual void return_(Item*) = 0;
  };
  // The backing data of a single item in the pool.
  struct Item {
    // get() returns a pointer to the item's data.
    inline T* get();
    // construct() calls the constructor on the item's data.
    inline void construct();
    // destruct() calls the destructor on the item's data.
    inline void destruct();
    // Raw, suitably-aligned storage for a T; constructed / destructed
    // explicitly via construct() and destruct().
    using Data = typename std::aligned_storage<sizeof(T), alignof(T)>::type;
    Data data;
    // Number of Loans currently referencing this item.
    std::atomic<int> refcount = {0};
    Item* next = nullptr;  // pointer to the next free item in the pool.
  };
};
// Loan<T> is an alias to Pool<T>::Loan.
template <typename T>
using Loan = typename Pool<T>::Loan;
////////////////////////////////////////////////////////////////////////////////
// Pool<T>::Item
////////////////////////////////////////////////////////////////////////////////
// get() returns a typed pointer to the item's payload storage.
template <typename T>
T* Pool<T>::Item::get() {
  return reinterpret_cast<T*>(&data);
}
// construct() placement-constructs the payload in the item's storage.
// Note: default-initialization (`T`, not `T()`) — trivial types are left
// uninitialized.
template <typename T>
void Pool<T>::Item::construct() {
  new (&data) T;
}
// destruct() destructs the payload. Must only be called on an item whose
// payload is currently constructed.
template <typename T>
void Pool<T>::Item::destruct() {
  get()->~T();
}
////////////////////////////////////////////////////////////////////////////////
// Pool<T>::Loan
////////////////////////////////////////////////////////////////////////////////
// Constructs a Loan for item, sharing ownership of the pool storage, and
// takes a reference on the item.
template <typename T>
Pool<T>::Loan::Loan(Item* item, const std::shared_ptr<Storage>& storage)
    : item(item), storage(storage) {
  item->refcount++;
}
// Copy constructor: both loans reference the same item; the item's refcount
// is incremented (unless copying an empty loan).
template <typename T>
Pool<T>::Loan::Loan(const Loan& other)
    : item(other.item), storage(other.storage) {
  if (item != nullptr) {
    item->refcount++;
  }
}
// Move constructor: steals other's item and storage without touching the
// item's refcount; other is left empty.
template <typename T>
Pool<T>::Loan::Loan(Loan&& other)
    : item(other.item), storage(std::move(other.storage)) {
  // Moving the shared_ptr (rather than copying it and then nulling it, as
  // before) avoids a redundant atomic ref-count increment/decrement. A
  // moved-from shared_ptr is guaranteed empty.
  other.item = nullptr;
}
// Destructor: drops this loan's reference via reset().
template <typename T>
Pool<T>::Loan::~Loan() {
  reset();
}
// reset() drops this loan's reference. If it was the last reference, the
// item is handed back to the owning storage. The loan is left empty.
template <typename T>
void Pool<T>::Loan::reset() {
  if (item != nullptr) {
    auto refs = --item->refcount;
    MARL_ASSERT(refs >= 0, "reset() called on zero-ref pool item");
    if (refs == 0) {
      // Last reference: return the item to the pool (which may destruct the
      // payload, depending on the pool's policy).
      storage->return_(item);
    }
    item = nullptr;
    storage = nullptr;
  }
}
// Copy-assignment: drops the current reference (if any) and takes a new
// reference on rhs's item.
template <typename T>
typename Pool<T>::Loan& Pool<T>::Loan::operator=(const Pool<T>::Loan& rhs) {
  // Self-assignment guard: without it, reset() would drop this loan's (and
  // therefore rhs's) reference first — potentially returning the item to
  // the pool and destructing its payload — and then read the already
  // cleared rhs, leaving the loan empty.
  if (this == &rhs) {
    return *this;
  }
  reset();
  if (rhs.item != nullptr) {
    item = rhs.item;
    storage = rhs.storage;
    rhs.item->refcount++;
  }
  return *this;
}
// Move-assignment: drops the current reference (if any) and steals rhs's
// item and storage. After reset() both of this loan's members are empty, so
// the swaps transfer rhs's state in and leave rhs empty. (A self-move leaves
// the loan empty, which is an acceptable moved-from state.)
template <typename T>
typename Pool<T>::Loan& Pool<T>::Loan::operator=(Pool<T>::Loan&& rhs) {
  reset();
  std::swap(item, rhs.item);
  std::swap(storage, rhs.storage);
  return *this;
}
// Dereferences to the loaned item's payload.
// Must not be called on an empty loan (item would be nullptr).
template <typename T>
T& Pool<T>::Loan::operator*() {
  return *item->get();
}
// Member access on the loaned item's payload.
// Must not be called on an empty loan.
template <typename T>
T* Pool<T>::Loan::operator->() const {
  return item->get();
}
// get() returns a pointer to the loaned item's payload.
// Must not be called on an empty loan.
template <typename T>
T* Pool<T>::Loan::get() const {
  return item->get();
}
////////////////////////////////////////////////////////////////////////////////
// BoundedPool<T, N, POLICY>
////////////////////////////////////////////////////////////////////////////////
// BoundedPool<T, N, POLICY> is a pool of items of type T, with a maximum
// capacity of N items.
// BoundedPool<> is initially populated with N default-constructed items.
// POLICY controls whether pool items are constructed and destructed each
// time they are borrowed from and returned to the pool.
template <typename T, int N, PoolPolicy POLICY = PoolPolicy::Reconstruct>
class BoundedPool : public Pool<T> {
 public:
  using Item = typename Pool<T>::Item;
  using Loan = typename Pool<T>::Loan;
  // borrow() borrows a single item from the pool, blocking until an item is
  // returned if the pool is empty.
  inline Loan borrow() const;
  // borrow() borrows count items from the pool, blocking until there are at
  // least count items in the pool. The function f() is called with each
  // borrowed item.
  // F must be a function with the signature: void(T&&)
  template <typename F>
  inline void borrow(size_t count, const F& f) const;
  // tryBorrow() attempts to borrow a single item from the pool without
  // blocking.
  // The boolean of the returned pair is true on success, or false if the pool
  // is empty.
  inline std::pair<Loan, bool> tryBorrow() const;

 private:
  class Storage : public Pool<T>::Storage {
   public:
    inline Storage();
    inline ~Storage();
    inline void return_(Item*) override;
    // Guards the free list below.
    std::mutex mutex;
    // Signaled by return_() whenever an item is pushed back onto the free
    // list; waited on by borrow().
    ConditionVariable returned;
    // Fixed backing array of all N items.
    Item items[N];
    // Head of the singly-linked list of unborrowed items.
    Item* free = nullptr;
  };
  // Shared with every Loan so the storage outlives the pool while loans are
  // outstanding.
  std::shared_ptr<Storage> storage = std::make_shared<Storage>();
};
// Storage constructor: threads every item of the fixed array onto the free
// list. Under the Preserve policy each item's payload is also constructed
// once here and kept alive for the lifetime of the pool.
template <typename T, int N, PoolPolicy POLICY>
BoundedPool<T, N, POLICY>::Storage::Storage() {
  for (int i = 0; i < N; i++) {
    auto& item = items[i];
    if (POLICY == PoolPolicy::Preserve) {
      item.construct();
    }
    // Push onto the head of the free list.
    item.next = this->free;
    this->free = &item;
  }
}
// Storage destructor.
// Preserve-policy items were constructed once in the constructor and are
// destructed once here. Reconstruct-policy items hold no constructed payload
// while free, so there is nothing to destruct.
template <typename T, int N, PoolPolicy POLICY>
BoundedPool<T, N, POLICY>::Storage::~Storage() {
  if (POLICY == PoolPolicy::Preserve) {
    for (int i = 0; i < N; i++) {
      items[i].destruct();
    }
  }
}
// borrow() borrows a single item, blocking until one is available.
template <typename T, int N, PoolPolicy POLICY>
typename BoundedPool<T, N, POLICY>::Loan BoundedPool<T, N, POLICY>::borrow()
    const {
  // Delegate to the counted overload, capturing the single loaned item.
  Loan result;
  borrow(1, [&](Loan&& loan) { result = std::move(loan); });
  return result;
}
// borrow() borrows n items, blocking until each becomes available, and
// passes each to f as a Loan&&.
template <typename T, int N, PoolPolicy POLICY>
template <typename F>
void BoundedPool<T, N, POLICY>::borrow(size_t n, const F& f) const {
  std::unique_lock<std::mutex> lock(storage->mutex);
  for (size_t i = 0; i < n; i++) {
    // Block until the free list is non-empty. 'returned' is notified by
    // Storage::return_() whenever an item comes back.
    storage->returned.wait(lock, [&] { return storage->free != nullptr; });
    // Pop the head of the free list.
    auto item = storage->free;
    storage->free = storage->free->next;
    // Reconstruct-policy pools construct the payload on each borrow.
    if (POLICY == PoolPolicy::Reconstruct) {
      item->construct();
    }
    // Hand the loan to the caller. The Loan shares ownership of storage, so
    // loaned items remain valid even if the pool itself is destructed.
    f(std::move(Loan(item, storage)));
  }
}
// tryBorrow() attempts to borrow a single item without blocking.
// Returns {loan, true} on success, or {empty loan, false} if the pool has
// no free items.
template <typename T, int N, PoolPolicy POLICY>
std::pair<typename BoundedPool<T, N, POLICY>::Loan, bool>
BoundedPool<T, N, POLICY>::tryBorrow() const {
  std::unique_lock<std::mutex> lock(storage->mutex);
  if (storage->free == nullptr) {
    return std::make_pair(Loan(), false);
  }
  // Pop the head of the free list.
  auto item = storage->free;
  storage->free = storage->free->next;
  // Fix: removed `item->pool = this;` — Item declares no `pool` member, so
  // any instantiation of tryBorrow() failed to compile.
  lock.unlock();
  if (POLICY == PoolPolicy::Reconstruct) {
    item->construct();
  }
  return std::make_pair(Loan(item, storage), true);
}
// return_() hands a fully-released item back to the pool.
// Called by Loan::reset() when the last reference is dropped.
template <typename T, int N, PoolPolicy POLICY>
void BoundedPool<T, N, POLICY>::Storage::return_(Item* item) {
  // Reconstruct-policy pools destruct the payload as soon as the loan ends.
  if (POLICY == PoolPolicy::Reconstruct) {
    item->destruct();
  }
  // Push the item back onto the free list.
  std::unique_lock<std::mutex> lock(mutex);
  item->next = free;
  free = item;
  lock.unlock();
  // Wake one thread blocked in borrow(). Notified after unlocking so the
  // woken thread does not immediately block on the mutex.
  returned.notify_one();
}
////////////////////////////////////////////////////////////////////////////////
// UnboundedPool
////////////////////////////////////////////////////////////////////////////////
// UnboundedPool<T, POLICY> is a pool of items of type T.
// UnboundedPool<> will automatically allocate more items if the pool becomes
// empty.
// POLICY controls whether pool items are constructed and destructed each
// time they are borrowed from and returned to the pool.
template <typename T, PoolPolicy POLICY = PoolPolicy::Reconstruct>
class UnboundedPool : public Pool<T> {
 public:
  using Item = typename Pool<T>::Item;
  using Loan = typename Pool<T>::Loan;
  // borrow() borrows a single item from the pool, automatically allocating
  // more items if the pool is empty.
  // This function does not block.
  inline Loan borrow() const;
  // borrow() borrows count items from the pool, calling the function f() with
  // each borrowed item.
  // F must be a function with the signature: void(T&&)
  // This function does not block.
  template <typename F>
  inline void borrow(size_t n, const F& f) const;

 private:
  class Storage : public Pool<T>::Storage {
   public:
    inline ~Storage();
    inline void return_(Item*) override;
    // Guards items and free below.
    std::mutex mutex;
    // Owns every heap-allocated item, for cleanup in the destructor.
    std::vector<Item*> items;
    // Head of the singly-linked list of unborrowed items.
    Item* free = nullptr;
  };
  // Shared with every Loan so the storage outlives the pool while loans are
  // outstanding.
  std::shared_ptr<Storage> storage = std::make_shared<Storage>();
};
// Storage destructor: destroys every allocated item.
// Loans keep this storage alive via shared_ptr, so no items are on loan when
// this runs. Preserve-policy payloads (constructed once at allocation) are
// destructed here; Reconstruct-policy items hold no constructed payload
// while free.
template <typename T, PoolPolicy POLICY>
UnboundedPool<T, POLICY>::Storage::~Storage() {
  for (auto item : items) {
    if (POLICY == PoolPolicy::Preserve) {
      item->destruct();
    }
    delete item;
  }
}
// borrow() borrows a single item, allocating more items if the pool is
// empty. Does not block.
template <typename T, PoolPolicy POLICY>
Loan<T> UnboundedPool<T, POLICY>::borrow() const {
  Loan out;
  // Delegate to the counted overload, capturing the single loaned item.
  borrow(1, [&](Loan&& loan) { out = std::move(loan); });
  return out;
}
// borrow() borrows n items, growing the pool whenever the free list runs
// dry, and passes each item to f as a Loan&&. Does not block.
template <typename T, PoolPolicy POLICY>
template <typename F>
inline void UnboundedPool<T, POLICY>::borrow(size_t n, const F& f) const {
  std::unique_lock<std::mutex> lock(storage->mutex);
  for (size_t i = 0; i < n; i++) {
    if (storage->free == nullptr) {
      // Grow the pool: allocate as many new items as currently exist
      // (doubling), with a floor of 32, and push them all onto the free
      // list.
      auto count = std::max<size_t>(storage->items.size(), 32);
      for (size_t j = 0; j < count; j++) {  // renamed: `i` shadowed the outer loop
        auto item = new Item();
        if (POLICY == PoolPolicy::Preserve) {
          item->construct();
        }
        storage->items.push_back(item);
        item->next = storage->free;
        storage->free = item;
      }
    }
    // Pop the head of the free list.
    auto item = storage->free;
    storage->free = storage->free->next;
    // Reconstruct-policy pools construct the payload on each borrow.
    if (POLICY == PoolPolicy::Reconstruct) {
      item->construct();
    }
    // Loan(...) is already a prvalue; the previous std::move was redundant.
    f(Loan(item, storage));
  }
}
// return_() hands a fully-released item back to the pool.
// Called by Loan::reset() when the last reference is dropped.
template <typename T, PoolPolicy POLICY>
void UnboundedPool<T, POLICY>::Storage::return_(Item* item) {
  // Reconstruct-policy pools destruct the payload as soon as the loan ends.
  if (POLICY == PoolPolicy::Reconstruct) {
    item->destruct();
  }
  // Push the item back onto the free list. The lock is released by the
  // unique_lock destructor; no condition variable to notify here, as
  // UnboundedPool::borrow() never blocks.
  std::unique_lock<std::mutex> lock(mutex);
  item->next = free;
  free = item;
}
} // namespace marl
#endif // marl_pool_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Stubs SAL annotation macros for platforms that do not support them.
// See
// https://docs.microsoft.com/en-us/visualstudio/code-quality/annotating-locking-behavior?view=vs-2019
#ifndef marl_sal_h
#define marl_sal_h
#ifndef _Requires_lock_held_
// No-op stub for toolchains that do not provide SAL lock annotations.
#define _Requires_lock_held_(x)
#endif
#ifndef _Requires_lock_not_held_
// No-op stub for toolchains that do not provide SAL lock annotations.
#define _Requires_lock_not_held_(x)
#endif
#endif // marl_sal_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_scheduler_h
#define marl_scheduler_h
#include "debug.h"
#include "sal.h"
#include <array>
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <unordered_map>
namespace marl {
class OSFiber;
// Task is a unit of work for the scheduler.
using Task = std::function<void()>;
// Scheduler asynchronously processes Tasks.
// A scheduler can be bound to one or more threads using the bind() method.
// Once bound to a thread, that thread can call marl::schedule() to enqueue
// work tasks to be executed asynchronously.
// Scheduler are initially constructed in single-threaded mode.
// Call setWorkerThreadCount() to spawn dedicated worker threads.
class Scheduler {
  // Per-thread task executor; defined below.
  class Worker;

 public:
  Scheduler();
  ~Scheduler();
  // get() returns the scheduler bound to the current thread.
  static Scheduler* get();
  // bind() binds this scheduler to the current thread.
  // There must be no existing scheduler bound to the thread prior to calling.
  void bind();
  // unbind() unbinds the scheduler currently bound to the current thread.
  // There must be a existing scheduler bound to the thread prior to calling.
  static void unbind();
  // enqueue() queues the task for asynchronous execution.
  void enqueue(Task&& task);
  // setThreadInitializer() sets the worker thread initializer function which
  // will be called for each new worker thread spawned.
  // The initializer will only be called on newly created threads (call
  // setThreadInitializer() before setWorkerThreadCount()).
  void setThreadInitializer(const std::function<void()>& init);
  // getThreadInitializer() returns the thread initializer function set by
  // setThreadInitializer().
  const std::function<void()>& getThreadInitializer();
  // setWorkerThreadCount() adjusts the number of dedicated worker threads.
  // A count of 0 puts the scheduler into single-threaded mode.
  // Note: Currently the number of threads cannot be adjusted once tasks
  // have been enqueued. This restriction may be lifted at a later time.
  void setWorkerThreadCount(int count);
  // getWorkerThreadCount() returns the number of worker threads.
  int getWorkerThreadCount();
  // Fibers expose methods to perform cooperative multitasking and are
  // automatically created by the Scheduler.
  //
  // The currently executing Fiber can be obtained by calling Fiber::current().
  //
  // When execution becomes blocked, yield() can be called to suspend execution
  // of the fiber and start executing other pending work. Once the block has
  // been lifted, schedule() can be called to reschedule the Fiber on the same
  // thread that previously executed it.
  class Fiber {
   public:
    ~Fiber();
    // current() returns the currently executing fiber, or nullptr if called
    // without a bound scheduler.
    static Fiber* current();
    // yield() suspends execution of this Fiber, allowing the thread to work
    // on other tasks.
    // yield() must only be called on the currently executing fiber.
    void yield();
    // schedule() reschedules the suspended Fiber for execution.
    void schedule();
    // id is the thread-unique identifier of the Fiber.
    uint32_t const id;

   private:
    friend class Scheduler;
    Fiber(OSFiber*, uint32_t id);
    // switchTo() switches execution to the given fiber.
    // switchTo() must only be called on the currently executing fiber.
    void switchTo(Fiber*);
    // create() constructs and returns a new fiber with the given identifier,
    // stack size that will executed func when switched to.
    static Fiber* create(uint32_t id,
                         size_t stackSize,
                         const std::function<void()>& func);
    // createFromCurrentThread() constructs and returns a new fiber with the
    // given identifier for the current thread.
    static Fiber* createFromCurrentThread(uint32_t id);
    // OS-specific fiber implementation (see OSFiber forward-declaration).
    OSFiber* const impl;
    // The Worker that created (and resumes) this fiber.
    Worker* const worker;
  };

 private:
  // Stack size in bytes of a new fiber.
  // TODO: Make configurable so the default size can be reduced.
  static constexpr size_t FiberStackSize = 1024 * 1024;
  // Maximum number of worker threads.
  static constexpr size_t MaxWorkerThreads = 64;
  // TODO: Implement a queue that recycles elements to reduce number of
  // heap allocations.
  using TaskQueue = std::queue<Task>;
  using FiberQueue = std::queue<Fiber*>;
  // Workers executes Tasks on a single thread.
  // Once a task is started, it may yield to other tasks on the same Worker.
  // Tasks are always resumed by the same Worker.
  class Worker {
   public:
    enum class Mode {
      // Worker will spawn a background thread to process tasks.
      MultiThreaded,
      // Worker will execute tasks whenever it yields.
      SingleThreaded,
    };
    Worker(Scheduler* scheduler, Mode mode, uint32_t id);
    // start() begins execution of the worker.
    void start();
    // stop() ceases execution of the worker, blocking until all pending
    // tasks have fully finished.
    void stop();
    // yield() suspends execution of the current task, and looks for other
    // tasks to start or continue execution.
    void yield(Fiber* fiber);
    // enqueue(Fiber*) enqueues resuming of a suspended fiber.
    void enqueue(Fiber* fiber);
    // enqueue(Task&&) enqueues a new, unstarted task.
    void enqueue(Task&& task);
    // tryLock() attempts to lock the worker for task enqueing.
    // If the lock was successful then true is returned, and the caller must
    // call enqueueAndUnlock().
    bool tryLock();
    // enqueueAndUnlock() enqueues the task and unlocks the worker.
    // Must only be called after a call to tryLock() which returned true.
    void enqueueAndUnlock(Task&& task);
    // flush() processes all pending tasks before returning.
    void flush();
    // dequeue() attempts to take a Task from the worker. Returns true if
    // a task was taken and assigned to out, otherwise false.
    bool dequeue(Task& out);
    // getCurrent() returns the Worker currently bound to the current
    // thread.
    static inline Worker* getCurrent();
    // getCurrentFiber() returns the Fiber currently being executed.
    inline Fiber* getCurrentFiber() const;
    // Unique identifier of the Worker.
    const uint32_t id;

   private:
    // run() is the task processing function for the worker.
    // If the worker was constructed in Mode::MultiThreaded, run() will
    // continue to process tasks until stop() is called.
    // If the worker was constructed in Mode::SingleThreaded, run() call
    // flush() and return.
    void run();
    // createWorkerFiber() creates a new fiber that when executed calls
    // run().
    Fiber* createWorkerFiber();
    // switchToFiber() switches execution to the given fiber. The fiber
    // must belong to this worker.
    void switchToFiber(Fiber*);
    // runUntilIdle() executes all pending tasks and then returns.
    _Requires_lock_held_(lock) void runUntilIdle(
        std::unique_lock<std::mutex>& lock);
    // waitForWork() blocks until new work is available, potentially calling
    // spinForWork().
    _Requires_lock_held_(lock) void waitForWork(
        std::unique_lock<std::mutex>& lock);
    // spinForWork() attempts to steal work from another Worker, and keeps
    // the thread awake for a short duration. This reduces overheads of
    // frequently putting the thread to sleep and re-waking.
    void spinForWork();
    // Work holds tasks and fibers that are enqueued on the Worker.
    struct Work {
      std::atomic<uint64_t> num = {0};  // tasks.size() + fibers.size()
      TaskQueue tasks;                  // guarded by mutex
      FiberQueue fibers;                // guarded by mutex
      std::condition_variable added;
      std::mutex mutex;
    };
    // 64-bit xorshift pseudo-random number generator, seeded from the
    // system clock so each worker gets a different sequence.
    // https://en.wikipedia.org/wiki/Xorshift
    class FastRnd {
     public:
      inline uint64_t operator()() {
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        return x;
      }

     private:
      uint64_t x = std::chrono::system_clock::now().time_since_epoch().count();
    };
    // The current worker bound to the current thread.
    static thread_local Worker* current;
    Mode const mode;
    Scheduler* const scheduler;
    std::unique_ptr<Fiber> mainFiber;
    Fiber* currentFiber = nullptr;
    std::thread thread;
    Work work;
    FiberQueue idleFibers;  // Fibers that have completed which can be reused.
    std::vector<std::unique_ptr<Fiber>>
        workerFibers;  // All fibers created by this worker.
    FastRnd rng;
    std::atomic<bool> shutdown = {false};
  };
  // stealWork() attempts to steal a task from the worker with the given id.
  // Returns true if a task was stolen and assigned to out, otherwise false.
  bool stealWork(Worker* thief, uint64_t from, Task& out);
  // onBeginSpinning() is called when a Worker calls spinForWork().
  // The scheduler will prioritize this worker for new tasks to try to prevent
  // it going to sleep.
  void onBeginSpinning(int workerId);
  // The scheduler currently bound to the current thread.
  static thread_local Scheduler* bound;
  std::function<void()> threadInitFunc;
  std::mutex threadInitFuncMutex;
  // Ids of workers that recently began spinning for work (see
  // onBeginSpinning()).
  std::array<std::atomic<int>, 8> spinningWorkers;
  // NOTE(review): initial value 0x8000000 looks like it may have been
  // intended as 0x80000000 — confirm against upstream.
  std::atomic<unsigned int> nextSpinningWorkerIdx = {0x8000000};
  // TODO: Make this lot thread-safe so setWorkerThreadCount() can be called
  // during execution of tasks.
  // Round-robin index used by enqueue() to pick the next worker.
  std::atomic<unsigned int> nextEnqueueIndex = {0};
  unsigned int numWorkerThreads = 0;
  std::array<Worker*, MaxWorkerThreads> workerThreads;
  std::mutex singleThreadedWorkerMutex;
  // Lazily-created workers for threads using the scheduler in
  // single-threaded mode, keyed by thread id.
  std::unordered_map<std::thread::id, std::unique_ptr<Worker>>
      singleThreadedWorkers;
};
// Returns the Worker bound to the current thread (nullptr if none), via the
// thread_local Worker::current.
Scheduler::Worker* Scheduler::Worker::getCurrent() {
  return Worker::current;
}
// Returns the fiber this worker is currently executing.
Scheduler::Fiber* Scheduler::Worker::getCurrentFiber() const {
  return currentFiber;
}
// schedule() schedules the function f to be asynchronously called with the
// given arguments using the currently bound scheduler.
template <typename Function, typename... Args>
inline void schedule(Function&& f, Args&&... args) {
  MARL_ASSERT_HAS_BOUND_SCHEDULER("marl::schedule");
  auto scheduler = Scheduler::get();
  // std::bind packages f and its arguments into a nullary Task. Note that
  // bind copies/moves its bound arguments; pass std::ref at the call site
  // for reference semantics.
  scheduler->enqueue(
      std::bind(std::forward<Function>(f), std::forward<Args>(args)...));
}
// schedule() schedules the function f to be asynchronously called using the
// currently bound scheduler.
// Zero-argument overload: f is forwarded directly into the scheduler's task
// queue without the std::bind wrapper.
template <typename Function>
inline void schedule(Function&& f) {
  MARL_ASSERT_HAS_BOUND_SCHEDULER("marl::schedule");
  auto scheduler = Scheduler::get();
  scheduler->enqueue(std::forward<Function>(f));
}
} // namespace marl
#endif // marl_scheduler_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_thread_h
#define marl_thread_h
namespace marl {
// Thread contains static methods that abstract OS-specific thread / cpu
// queries and control.
// Thread contains static methods that abstract OS-specific thread / cpu
// queries and control. Definitions live in per-platform implementation
// files.
class Thread {
 public:
  // setName() sets the name of the currently executing thread for displaying
  // in a debugger. Accepts a printf-style format string.
  static void setName(const char* fmt, ...);
  // numLogicalCPUs() returns the number of available logical CPU cores for
  // the system.
  static unsigned int numLogicalCPUs();
};
} // namespace marl
#endif // marl_thread_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_ticket_h
#define marl_ticket_h
#include "conditionvariable.h"
#include "pool.h"
#include "scheduler.h"
namespace marl {
// Ticket is a synchronization primitive used to serially order execution.
//
// Tickets exist in 3 mutually exclusive states: Waiting, Called and Finished.
//
// Tickets are obtained from a Ticket::Queue, using the Ticket::Queue::take()
// methods. The order in which tickets are taken from the queue dictates the
// order in which they are called.
//
// The first ticket to be taken from a queue will be in the 'called' state,
// others will be in the 'waiting' state until the previous ticket has finished.
//
// Ticket::wait() will block until the ticket is called.
// Ticket::done() sets the ticket into the 'finished' state and calls the next
// taken ticket from the queue.
//
// If a ticket is taken from a queue and does not have done() called before
// its last reference is dropped, it will implicitly call done(), calling the
// next ticket.
//
// Example:
//
// void runTasksConcurrentThenSerially(int numConcurrentTasks)
// {
// marl::Ticket::Queue queue;
// for (int i = 0; i < numConcurrentTasks; i++)
// {
// auto ticket = queue.take();
// marl::schedule([=] {
// doConcurrentWork(); // <- function may be called concurrently
// ticket.wait(); // <- serialize tasks
// doSerialWork(); // <- function will not be called concurrently
// ticket.done(); // <- optional, as done() is called implicitly on
// // dropping of last reference
// });
// }
// }
class Ticket {
  struct Shared;
  struct Record;

 public:
  // Queue hands out Tickets.
  class Queue {
   public:
    // take() returns a single ticket from the queue.
    inline Ticket take();
    // take() retrieves count tickets from the queue, calling f() with each
    // retrieved ticket.
    // F must be a function of the signature: void(Ticket&&)
    template <typename F>
    inline void take(size_t count, const F& f);

   private:
    // State shared between this queue and every ticket taken from it.
    std::shared_ptr<Shared> shared = std::make_shared<Shared>();
    // Pool of Record allocations, recycled as ticket records are released.
    UnboundedPool<Record> pool;
  };

  inline Ticket() = default;
  inline Ticket(const Ticket& other) = default;
  inline Ticket(Ticket&& other) = default;
  inline Ticket& operator=(const Ticket& other) = default;

  // wait() blocks until the ticket is called.
  inline void wait() const;
  // done() marks the ticket as finished and calls the next ticket.
  inline void done() const;
  // onCall() registers the function f to be invoked when this ticket is
  // called. If the ticket is already called prior to calling onCall(), then
  // f() will be executed immediately.
  // F must be a function of the signature: void F()
  template <typename F>
  inline void onCall(F&& f) const;

 private:
  // Internal doubly-linked-list data structure. One per ticket instance.
  struct Record {
    inline ~Record();
    // done() marks the record as finished, unlinks it, and - if it was the
    // head of the list - calls the next record.
    inline void done();
    // callAndUnlock() moves the record into the called state, wakes waiters,
    // and releases lock before scheduling any registered onCall task.
    inline void callAndUnlock(std::unique_lock<std::mutex>& lock);
    ConditionVariable isCalledCondVar;  // signalled when isCalled is set
    std::shared_ptr<Shared> shared;
    Record* next = nullptr;  // guarded by shared->mutex
    Record* prev = nullptr;  // guarded by shared->mutex
    inline void unlink();    // guarded by shared->mutex
    Task onCall;             // guarded by shared->mutex
    bool isCalled = false;   // guarded by shared->mutex
    std::atomic<bool> isDone = {false};
  };

  // Data shared between all tickets and the queue.
  struct Shared {
    std::mutex mutex;
    Record tail;  // sentinel marking the end of the ticket list
  };

  inline Ticket(Loan<Record>&& record);
  // Loan of this ticket's record; returned to the queue's pool on release.
  Loan<Record> record;
};
////////////////////////////////////////////////////////////////////////////////
// Ticket
////////////////////////////////////////////////////////////////////////////////
Ticket::Ticket(Loan<Record>&& record) : record(std::move(record)) {}
// wait() blocks the caller until this ticket's record enters the called
// state. Waits on the condition variable under the queue-wide mutex.
void Ticket::wait() const {
  std::unique_lock<std::mutex> lock(record->shared->mutex);
  record->isCalledCondVar.wait(lock, [this] { return record->isCalled; });
}
// done() finishes this ticket, calling the next taken ticket (if any).
void Ticket::done() const {
  record->done();
}
// onCall() registers f to be invoked (as a scheduled task) when this ticket
// is called. If the ticket has already been called, f is scheduled
// immediately. If a function was already registered, the two are chained and
// run in registration order.
template <typename Function>
void Ticket::onCall(Function&& f) const {
  std::unique_lock<std::mutex> lock(record->shared->mutex);
  if (record->isCalled) {
    // Already called: schedule f right away.
    // std::forward (not std::move) so an lvalue argument is copied, not
    // silently moved-from.
    marl::schedule(std::forward<Function>(f));
    return;
  }
  if (record->onCall) {
    // A task is already registered: wrap both so they run in order.
    struct Joined {
      void operator()() const {
        a();
        b();
      }
      Task a, b;
    };
    // Joined{...} is a prvalue; no std::move needed (it would inhibit copy
    // elision).
    record->onCall = Joined{std::move(record->onCall), std::forward<Function>(f)};
  } else {
    record->onCall = std::forward<Function>(f);
  }
}
////////////////////////////////////////////////////////////////////////////////
// Ticket::Queue
////////////////////////////////////////////////////////////////////////////////
// take() returns a single ticket from the queue, implemented in terms of the
// batched overload.
Ticket Ticket::Queue::take() {
  Ticket result;
  take(1, [&](Ticket&& t) { result = std::move(t); });
  return result;
}
// take() borrows n records from the pool, links them into a chain, splices
// the chain onto the end of the queue's list, and hands each new Ticket to f.
// If the spliced chain becomes the head of the list, its first ticket is
// called immediately.
template <typename F>
void Ticket::Queue::take(size_t n, const F& f) {
  if (n == 0) {
    return;  // Nothing to take. Also avoids dereferencing a null 'last' below.
  }
  Loan<Record> first, last;
  pool.borrow(n, [&](Loan<Record>&& record) {
    Loan<Record> rec = std::move(record);
    rec->shared = shared;
    if (first.get() == nullptr) {
      first = rec;
    }
    if (last.get() != nullptr) {
      last->next = rec.get();
      rec->prev = last.get();
    }
    last = rec;
    // Ticket(...) is a prvalue; passing it directly avoids a pessimizing
    // std::move.
    f(Ticket(std::move(rec)));
  });
  // The new chain is not yet visible to other threads, so last->next can be
  // written before taking the lock.
  last->next = &shared->tail;
  std::unique_lock<std::mutex> lock(shared->mutex);
  first->prev = shared->tail.prev;
  shared->tail.prev = last.get();
  if (first->prev == nullptr) {
    // Chain was spliced onto an empty list: call the first ticket now.
    first->callAndUnlock(lock);
  } else {
    first->prev->next = first.get();
  }
}
////////////////////////////////////////////////////////////////////////////////
// Ticket::Record
////////////////////////////////////////////////////////////////////////////////
// Destructor: if this record was ever taken from a queue (shared is set),
// ensure done() is called so the next ticket is not left waiting forever.
Ticket::Record::~Record() {
  if (shared != nullptr) {
    done();
  }
}
// done() finishes the record (at most once - guarded by the isDone flag),
// unlinks it from the list and, if this record was the head, calls the next.
void Ticket::Record::done() {
  if (isDone.exchange(true)) {
    return;  // done() already ran for this record.
  }
  std::unique_lock<std::mutex> lock(shared->mutex);
  // Only the head of the list (prev == nullptr) may call its successor.
  auto callNext = (prev == nullptr && next != nullptr) ? next : nullptr;
  unlink();
  if (callNext != nullptr) {
    // lock needs to be held otherwise callNext might be destructed.
    callNext->callAndUnlock(lock);
  }
}
// callAndUnlock() marks the record as called, wakes all waiters, then drops
// the lock before scheduling the registered onCall task (if any).
// The lock must be held on entry. On the early-return path the lock is left
// held (the caller's unique_lock releases it); otherwise it is unlocked here.
void Ticket::Record::callAndUnlock(std::unique_lock<std::mutex>& lock) {
  if (isCalled) {
    return;
  }
  isCalled = true;
  Task task;
  // Take ownership of the onCall task before releasing the lock.
  std::swap(task, onCall);
  isCalledCondVar.notify_all();
  lock.unlock();
  if (task) {
    marl::schedule(std::move(task));
  }
}
// unlink() detaches this record from the doubly-linked list, stitching its
// neighbours together. Must be called with shared->mutex held.
void Ticket::Record::unlink() {
  auto* p = prev;
  auto* n = next;
  if (p != nullptr) {
    p->next = n;
  }
  if (n != nullptr) {
    n->prev = p;
  }
  prev = nullptr;
  next = nullptr;
}
} // namespace marl
#endif // marl_ticket_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The Trace API produces a trace event file that can be consumed with Chrome's
// chrome://tracing viewer.
// Documentation can be found at:
// https://www.chromium.org/developers/how-tos/trace-event-profiling-tool
// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit
#define MARL_TRACE_ENABLED 0
#if MARL_TRACE_ENABLED
#include <array>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdarg>
#include <cstring>
#include <mutex>
#include <ostream>
#include <queue>
#include <thread>
namespace marl {
// Trace writes a trace event file into the current working directory that can
// be consumed with Chrome's chrome://tracing viewer.
// Use the MARL_* macros below instead of using this class directly.
class Trace {
 public:
  // Maximum length (including null terminator) of a formatted event name.
  static constexpr size_t MaxEventNameLength = 64;

  // get() returns the Trace singleton. The Scoped* helpers below null-check
  // the returned pointer before use.
  static Trace* get();

  // nameThread() names the current thread in the trace output (printf-style).
  void nameThread(const char* fmt, ...);
  // beginEvent() / endEvent() bracket a duration event on this thread.
  void beginEvent(const char* fmt, ...);
  void endEvent();
  // beginAsyncEvent() / endAsyncEvent() bracket an async event, matched by id.
  void beginAsyncEvent(uint32_t id, const char* fmt, ...);
  void endAsyncEvent(uint32_t id, const char* fmt, ...);

  // ScopedEvent begins an event on construction and ends it on destruction.
  class ScopedEvent {
   public:
    inline ScopedEvent(const char* fmt, ...);
    inline ~ScopedEvent();

   private:
    Trace* const trace;
  };

  // ScopedAsyncEvent begins an async event on construction and ends it on
  // destruction, replaying the formatted name in the end event.
  class ScopedAsyncEvent {
   public:
    inline ScopedAsyncEvent(uint32_t id, const char* fmt, ...);
    inline ~ScopedAsyncEvent();

   private:
    Trace* const trace;
    const uint32_t id;
    std::string name;  // formatted name, kept for the end event
  };

 private:
  Trace();
  ~Trace();
  Trace(const Trace&) = delete;
  Trace& operator=(const Trace&) = delete;

  // Event is the base for all trace event records written to the file.
  struct Event {
    // Type characters correspond to the trace event format's 'ph' phase field.
    enum class Type : uint8_t {
      Begin = 'B',
      End = 'E',
      Complete = 'X',
      Instant = 'i',
      Counter = 'C',
      AsyncStart = 'b',
      AsyncInstant = 'n',
      AsyncEnd = 'e',
      FlowStart = 's',
      FlowStep = 't',
      FlowEnd = 'f',
      Sample = 'P',
      ObjectCreated = 'N',
      ObjectSnapshot = 'O',
      ObjectDestroyed = 'D',
      Metadata = 'M',
      GlobalMemoryDump = 'V',
      ProcessMemoryDump = 'v',
      Mark = 'R',
      ClockSync = 'c',
      ContextEnter = '(',
      ContextLeave = ')',
      // Internal types
      Shutdown = 'S',  // sentinel telling the writer thread to stop
    };
    Event();
    virtual ~Event() = default;
    virtual Type type() const = 0;
    // write() serializes the event to out.
    virtual void write(std::ostream& out) const;
    char name[MaxEventNameLength] = {};
    const char** categories = nullptr;
    uint64_t timestamp = 0;  // in microseconds
    uint32_t processID = 0;
    uint32_t threadID;
    uint32_t fiberID;
  };
  struct BeginEvent : public Event {
    Type type() const override { return Type::Begin; }
  };
  struct EndEvent : public Event {
    Type type() const override { return Type::End; }
  };
  struct MetadataEvent : public Event {
    Type type() const override { return Type::Metadata; }
  };
  struct Shutdown : public Event {
    Type type() const override { return Type::Shutdown; }
  };
  struct AsyncEvent : public Event {
    void write(std::ostream& out) const override;
    uint32_t id;
  };
  struct AsyncStartEvent : public AsyncEvent {
    Type type() const override { return Type::AsyncStart; }
  };
  struct AsyncEndEvent : public AsyncEvent {
    Type type() const override { return Type::AsyncEnd; }
  };
  struct NameThreadEvent : public MetadataEvent {
    void write(std::ostream& out) const override;
  };

  // timestamp() returns microseconds since the Trace was created.
  uint64_t timestamp();  // in microseconds

  // put() enqueues an event for the writer; take() dequeues the next one.
  void put(Event*);
  std::unique_ptr<Event> take();

  struct EventQueue {
    std::queue<std::unique_ptr<Event> > data;  // guarded by mutex
    std::condition_variable condition;
    std::mutex mutex;
  };
  // TODO: Increasing this from 1 can cause events to go out of order.
  // Investigate, fix.
  std::array<EventQueue, 1> eventQueues;
  std::atomic<unsigned int> eventQueueWriteIdx = {0};
  unsigned int eventQueueReadIdx = 0;
  // Reference point for timestamp().
  std::chrono::time_point<std::chrono::high_resolution_clock> createdAt =
      std::chrono::high_resolution_clock::now();
  std::thread thread;               // writer thread
  std::atomic<bool> stopped = {false};
};
// Formats the event name and begins a duration event; the matching endEvent()
// is issued by the destructor.
Trace::ScopedEvent::ScopedEvent(const char* fmt, ...) : trace(Trace::get()) {
  if (trace != nullptr) {
    char name[Trace::MaxEventNameLength];
    va_list vararg;
    va_start(vararg, fmt);
    vsnprintf(name, Trace::MaxEventNameLength, fmt, vararg);
    va_end(vararg);
    // Pass the already-formatted name through "%s" so any '%' characters in
    // the expanded string are not re-interpreted as format specifiers.
    // (Matches ScopedAsyncEvent, which does the same.)
    trace->beginEvent("%s", name);
  }
}
// Ends the duration event begun by the constructor.
Trace::ScopedEvent::~ScopedEvent() {
  if (trace != nullptr) {
    trace->endEvent();
  }
}
// Formats the event name, stores a copy for the matching end event, and
// begins the async event.
Trace::ScopedAsyncEvent::ScopedAsyncEvent(uint32_t id, const char* fmt, ...)
    : trace(Trace::get()), id(id) {
  if (trace != nullptr) {
    char buf[Trace::MaxEventNameLength];
    va_list vararg;
    va_start(vararg, fmt);
    vsnprintf(buf, Trace::MaxEventNameLength, fmt, vararg);
    va_end(vararg);
    name = buf;  // kept so the destructor can replay the same name.
    trace->beginAsyncEvent(id, "%s", buf);
  }
}
// Ends the async event begun by the constructor, replaying the stored name.
Trace::ScopedAsyncEvent::~ScopedAsyncEvent() {
  if (trace != nullptr) {
    trace->endAsyncEvent(id, "%s", name.c_str());
  }
}
} // namespace marl
// MARL_CONCAT() token-pastes a and b, with a level of indirection so that
// macro arguments (e.g. __LINE__) are expanded first.
#define MARL_CONCAT_(a, b) a##b
#define MARL_CONCAT(a, b) MARL_CONCAT_(a, b)
// MARL_SCOPED_EVENT() traces a duration event spanning the remainder of the
// enclosing scope.
#define MARL_SCOPED_EVENT(...) \
  marl::Trace::ScopedEvent MARL_CONCAT(scoped_event, __LINE__)(__VA_ARGS__);
// MARL_BEGIN_ASYNC_EVENT() begins an async trace event that is ended by a
// later MARL_END_ASYNC_EVENT() with the same id.
#define MARL_BEGIN_ASYNC_EVENT(id, ...)      \
  do {                                       \
    if (auto t = marl::Trace::get()) {       \
      t->beginAsyncEvent(id, __VA_ARGS__);   \
    }                                        \
  } while (false);
// MARL_END_ASYNC_EVENT() ends the async trace event begun with the same id.
#define MARL_END_ASYNC_EVENT(id, ...)        \
  do {                                       \
    if (auto t = marl::Trace::get()) {       \
      t->endAsyncEvent(id, __VA_ARGS__);     \
    }                                        \
  } while (false);
// MARL_SCOPED_ASYNC_EVENT() traces an async event spanning the remainder of
// the enclosing scope.
#define MARL_SCOPED_ASYNC_EVENT(id, ...) \
  marl::Trace::ScopedAsyncEvent MARL_CONCAT(defer_, __LINE__)(id, __VA_ARGS__);
// MARL_NAME_THREAD() names the current thread in the trace output.
#define MARL_NAME_THREAD(...)                \
  do {                                       \
    if (auto t = marl::Trace::get()) {       \
      t->nameThread(__VA_ARGS__);            \
    }                                        \
  } while (false);
#else  // MARL_TRACE_ENABLED
// Tracing disabled: all trace macros compile to nothing.
#define MARL_SCOPED_EVENT(...)
#define MARL_BEGIN_ASYNC_EVENT(id, ...)
#define MARL_END_ASYNC_EVENT(id, ...)
#define MARL_SCOPED_ASYNC_EVENT(id, ...)
#define MARL_NAME_THREAD(...)
#endif  // MARL_TRACE_ENABLED
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_util_h
#define marl_util_h
#include "scheduler.h"
#include "waitgroup.h"
namespace marl {
// parallelize() is used to split a number of work items into N smaller batches
// which can be processed in parallel with the function f().
// numTotal is the total number of work items to process.
// numPerTask is the maximum number of work items to process per call to f().
// There will always be at least one call to f().
// F must be a function with the signature:
// void(COUNTER taskIndex, COUNTER first, COUNTER count)
// COUNTER is any integer type.
template <typename F, typename COUNTER>
inline void parallelize(COUNTER numTotal, COUNTER numPerTask, const F& f) {
  if (numTotal <= numPerTask) {
    // Single batch (including numTotal == 0): run inline with nothing to
    // schedule or wait on. This guard also prevents numTotal == 0 producing
    // numTasks == 0 below, which would construct the WaitGroup with
    // (COUNTER)(0 - 1) - underflowing its unsigned counter and deadlocking
    // wait() - and it guarantees the documented "at least one call to f()".
    f(0, 0, numTotal);
    return;
  }
  // Number of batches, rounding up so the last batch covers the remainder.
  auto numTasks = (numTotal + numPerTask - 1) / numPerTask;
  WaitGroup wg(numTasks - 1);
  // Schedule every batch except the first onto worker fibers.
  for (unsigned int task = 1; task < numTasks; task++) {
    schedule([=] {
      auto first = task * numPerTask;
      auto count = std::min(first + numPerTask, numTotal) - first;
      f(task, first, count);
      wg.done();
    });
  }
  // Run the first chunk on this fiber to reduce the amount of time spent
  // waiting.
  f(0, 0, std::min(numPerTask, numTotal));
  wg.wait();
}
} // namespace marl
#endif // marl_util_h
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef marl_waitgroup_h
#define marl_waitgroup_h
#include "conditionvariable.h"
#include "debug.h"
#include <atomic>
#include <mutex>
namespace marl {
// WaitGroup is a synchronization primitive that holds an internal counter that
// can be incremented, decremented and waited on until it reaches 0.
// WaitGroups can be used as a simple mechanism for waiting for a number of
// concurrently executed tasks to complete.
//
// Example:
//
// void runTasksConcurrently(int numConcurrentTasks)
// {
// // Construct the WaitGroup with an initial count of numConcurrentTasks.
// marl::WaitGroup wg(numConcurrentTasks);
// for (int i = 0; i < numConcurrentTasks; i++)
// {
// // Schedule a task to be run asynchronously.
// // These may all be run concurrently.
// marl::schedule([=] {
// // Once the task has finished, decrement the waitgroup counter
// // to signal that this has completed.
// defer(wg.done());
// doSomeWork();
// });
// }
// // Block until all tasks have completed.
// wg.wait();
// }
class WaitGroup {
 public:
  // Constructs the WaitGroup with the specified initial count.
  inline WaitGroup(unsigned int initialCount = 0);
  // add() increments the internal counter by count.
  inline void add(unsigned int count = 1) const;
  // done() decrements the internal counter by one.
  // Returns true if the internal count has reached zero.
  inline bool done() const;
  // wait() blocks until the WaitGroup counter reaches zero.
  inline void wait() const;

 private:
  // State shared by all copies of this WaitGroup.
  struct Data {
    std::atomic<unsigned int> count = {0};
    ConditionVariable condition;  // notified when count reaches zero
    std::mutex mutex;             // held while waiting / notifying
  };
  // shared_ptr so that copies of the WaitGroup observe the same counter.
  const std::shared_ptr<Data> data = std::make_shared<Data>();
};
// Constructs the WaitGroup with the given initial counter value.
inline WaitGroup::WaitGroup(unsigned int initialCount /* = 0 */) {
  data->count.store(initialCount);
}
// add() atomically increments the internal counter by count.
void WaitGroup::add(unsigned int count /* = 1 */) const {
  data->count.fetch_add(count);
}
// done() decrements the counter by one; when it reaches zero all waiters are
// woken. Returns true if this call brought the count to zero.
bool WaitGroup::done() const {
  MARL_ASSERT(data->count > 0, "marl::WaitGroup::done() called too many times");
  auto count = --data->count;
  if (count == 0) {
    // The mutex is taken before notifying so the notification cannot slip in
    // between a waiter's predicate check and its sleep.
    std::unique_lock<std::mutex> lock(data->mutex);
    data->condition.notify_all();
    return true;
  }
  return false;
}
// wait() blocks until the counter reaches zero.
void WaitGroup::wait() const {
  std::unique_lock<std::mutex> lock(data->mutex);
  data->condition.wait(lock, [this] { return data->count == 0; });
}
} // namespace marl
#endif // marl_waitgroup_h
# Format: //devtools/kokoro/config/proto/build.proto
# Location of the continuous bash script in Git.
build_file: "marl/kokoro/linux/presubmit.sh"
#!/bin/bash
set -e  # Fail on any error.
set -x  # Display commands being run.
cd github/marl
git submodule update --init
mkdir build
cd build
# build_and_run() configures the project with the given extra CMake flags,
# builds it, and runs the unit tests and the fractal example.
build_and_run() {
  cmake .. -DMARL_BUILD_EXAMPLES=1 $1
  make --jobs=$(nproc)
  ./marl-unittests
  ./fractal
}
build_and_run ""
build_and_run "-DMARL_ASAN=1"
# MARL_MSAN is not a defined CMake option (CMakeLists.txt only declares
# MARL_ASAN and MARL_TSAN), so the previous "-DMARL_MSAN=1" silently repeated
# the plain build. Run the thread-sanitizer build instead.
build_and_run "-DMARL_TSAN=1"
\ No newline at end of file
# Format: //devtools/kokoro/config/proto/build.proto
# Location of the continuous bash script in Git.
build_file: "marl/kokoro/macos/presubmit.sh"
#!/bin/bash
set -e  # Fail on any error.
set -x  # Display commands being run.
cd github/marl
# Fetch the googletest submodule required by the tests.
git submodule update --init
mkdir build
cd build
# Configure and build marl with the examples, then run the unit tests and
# the fractal example.
cmake .. -DMARL_BUILD_EXAMPLES=1
make -j$(sysctl -n hw.logicalcpu)
./marl-unittests
./fractal
@echo on
SETLOCAL ENABLEDELAYEDEXPANSION
REM Make python and cmake available on PATH.
SET PATH=C:\python36;C:\Program Files\cmake\bin;%PATH%
set SRC=%cd%\github\marl
cd %SRC%
if !ERRORLEVEL! neq 0 exit /b !ERRORLEVEL!
REM Fetch the googletest submodule required by the tests.
git submodule update --init
if !ERRORLEVEL! neq 0 exit /b !ERRORLEVEL!
SET MSBUILD="C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild"
SET CONFIG=Release
mkdir %SRC%\build
cd %SRC%\build
if !ERRORLEVEL! neq 0 exit /b !ERRORLEVEL!
REM Generate a VS2017 x64 solution using the 64-bit toolchain (-Thost=x64).
cmake .. -G "Visual Studio 15 2017 Win64" -Thost=x64 "-DMARL_BUILD_EXAMPLES=1"
if !ERRORLEVEL! neq 0 exit /b !ERRORLEVEL!
%MSBUILD% /p:Configuration=%CONFIG% Marl.sln
if !ERRORLEVEL! neq 0 exit /b !ERRORLEVEL!
REM Run the unit tests and the fractal example.
Release\marl-unittests.exe
if !ERRORLEVEL! neq 0 exit /b !ERRORLEVEL!
Release\fractal.exe
if !ERRORLEVEL! neq 0 exit /b !ERRORLEVEL!
# Format: //devtools/kokoro/config/proto/build.proto
# Location of the continuous bash script in Git.
build_file: "marl/kokoro/windows/presubmit.bat"
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl/blockingcall.h"
#include "marl/defer.h"
#include "marl_test.h"
#include <mutex>
// Schedules 100 tasks that each block (inside blocking_call) on a mutex held
// by this fiber, then schedules one more task to release the mutex. The test
// passes only if the scheduler keeps making progress while all 100 tasks are
// blocked - i.e. the final unlocking task still gets to run.
TEST_P(WithBoundScheduler, BlockingCall) {
  auto mutex = std::make_shared<std::mutex>();
  mutex->lock();  // Held until the final scheduled task releases it.
  marl::WaitGroup wg(100);
  for (int i = 0; i < 100; i++) {
    marl::schedule([=] {
      defer(wg.done());
      // blocking_call() runs the lambda where it may block without starving
      // the scheduler's worker threads.
      marl::blocking_call([=] {
        mutex->lock();
        defer(mutex->unlock());
      });
    });
  }
  marl::schedule([=] { mutex->unlock(); });  // Unblock all 100 tasks.
  wg.wait();
}
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl/conditionvariable.h"
#include "marl_test.h"
// Ping-pongs three times between this thread and a std::thread using a
// marl::ConditionVariable with no scheduler bound: the main thread sets
// trigger[i] and waits for the worker to set signal[i].
TEST(WithoutBoundScheduler, ConditionVariable) {
  bool trigger[3] = {false, false, false};
  bool signal[3] = {false, false, false};
  std::mutex mutex;
  marl::ConditionVariable cv;
  std::thread thread([&] {
    for (int i = 0; i < 3; i++) {
      std::unique_lock<std::mutex> lock(mutex);
      cv.wait(lock, [&] { return trigger[i]; });
      signal[i] = true;
      cv.notify_one();
    }
  });
  // Nothing has been triggered yet, so no signals may be set.
  ASSERT_FALSE(signal[0]);
  ASSERT_FALSE(signal[1]);
  ASSERT_FALSE(signal[2]);
  for (int i = 0; i < 3; i++) {
    {
      std::unique_lock<std::mutex> lock(mutex);
      trigger[i] = true;
      cv.notify_one();
      cv.wait(lock, [&] { return signal[i]; });
    }
    // Exactly the signals up to and including i must now be set.
    ASSERT_EQ(signal[0], 0 <= i);
    ASSERT_EQ(signal[1], 1 <= i);
    ASSERT_EQ(signal[2], 2 <= i);
  }
  thread.join();
}
// Same ping-pong exchange as the WithoutBoundScheduler variant, but run with
// a scheduler bound to the main thread (the std::thread remains unbound).
TEST_P(WithBoundScheduler, ConditionVariable) {
  bool trigger[3] = {false, false, false};
  bool signal[3] = {false, false, false};
  std::mutex mutex;
  marl::ConditionVariable cv;
  std::thread thread([&] {
    for (int i = 0; i < 3; i++) {
      std::unique_lock<std::mutex> lock(mutex);
      cv.wait(lock, [&] { return trigger[i]; });
      signal[i] = true;
      cv.notify_one();
    }
  });
  // Nothing has been triggered yet, so no signals may be set.
  ASSERT_FALSE(signal[0]);
  ASSERT_FALSE(signal[1]);
  ASSERT_FALSE(signal[2]);
  for (int i = 0; i < 3; i++) {
    {
      std::unique_lock<std::mutex> lock(mutex);
      trigger[i] = true;
      cv.notify_one();
      cv.wait(lock, [&] { return signal[i]; });
    }
    // Exactly the signals up to and including i must now be set.
    ASSERT_EQ(signal[0], 0 <= i);
    ASSERT_EQ(signal[1], 1 <= i);
    ASSERT_EQ(signal[2], 2 <= i);
  }
  thread.join();
}
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl/containers.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include <cstddef>
#include <string>
class ContainersVectorTest : public testing::Test {};
// A default-constructed vector holds no elements.
TEST(ContainersVectorTest, Empty) {
  marl::containers::vector<std::string, 4> v;
  ASSERT_EQ(v.size(), size_t(0));
}
// Element access via operator[] while staying within the fixed capacity (4).
TEST(ContainersVectorTest, WithinFixedCapIndex) {
  marl::containers::vector<std::string, 4> v;
  v.resize(4);
  const char* const expect[] = {"A", "B", "C", "D"};
  for (size_t i = 0; i < 4; i++) {
    v[i] = expect[i];
  }
  for (size_t i = 0; i < 4; i++) {
    ASSERT_EQ(v[i], expect[i]);
  }
}
// Element access via operator[] after growing past the fixed capacity (1).
TEST(ContainersVectorTest, BeyondFixedCapIndex) {
  marl::containers::vector<std::string, 1> v;
  v.resize(4);
  const char* const expect[] = {"A", "B", "C", "D"};
  for (size_t i = 0; i < 4; i++) {
    v[i] = expect[i];
  }
  for (size_t i = 0; i < 4; i++) {
    ASSERT_EQ(v[i], expect[i]);
  }
}
// push_back() four elements (within the fixed capacity), then pop them off
// one at a time, checking size, iterator distance, front and back each step.
TEST(ContainersVectorTest, WithinFixedCapPushPop) {
  marl::containers::vector<std::string, 4> v;
  for (auto s : {"A", "B", "C", "D"}) {
    v.push_back(s);
  }
  const char* const backs[] = {"D", "C", "B", "A"};
  for (size_t n = 4; n > 0; n--) {
    ASSERT_EQ(v.size(), n);
    ASSERT_EQ(v.end() - v.begin(), ptrdiff_t(n));
    ASSERT_EQ(v.front(), "A");
    ASSERT_EQ(v.back(), backs[4 - n]);
    v.pop_back();
  }
  ASSERT_EQ(v.size(), size_t(0));
}
// push_back() four elements (growing beyond the fixed capacity of 2), then
// pop them off one at a time, checking size, iterator distance, front and
// back each step.
TEST(ContainersVectorTest, BeyondFixedCapPushPop) {
  marl::containers::vector<std::string, 2> v;
  for (auto s : {"A", "B", "C", "D"}) {
    v.push_back(s);
  }
  const char* const backs[] = {"D", "C", "B", "A"};
  for (size_t n = 4; n > 0; n--) {
    ASSERT_EQ(v.size(), n);
    ASSERT_EQ(v.end() - v.begin(), ptrdiff_t(n));
    ASSERT_EQ(v.front(), "A");
    ASSERT_EQ(v.back(), backs[4 - n]);
    v.pop_back();
  }
  ASSERT_EQ(v.size(), size_t(0));
}
// Copy-construction between vectors with different fixed capacities.
TEST(ContainersVectorTest, CopyConstruct) {
  marl::containers::vector<std::string, 4> src;
  src.resize(3);
  src[0] = "A";
  src[1] = "B";
  src[2] = "C";
  marl::containers::vector<std::string, 2> dst(src);
  ASSERT_EQ(dst.size(), size_t(3));
  ASSERT_EQ(dst[0], "A");
  ASSERT_EQ(dst[1], "B");
  ASSERT_EQ(dst[2], "C");
}
// Move-construction between vectors with different fixed capacities.
TEST(ContainersVectorTest, MoveConstruct) {
  marl::containers::vector<std::string, 4> src;
  src.resize(3);
  src[0] = "A";
  src[1] = "B";
  src[2] = "C";
  marl::containers::vector<std::string, 2> dst(std::move(src));
  ASSERT_EQ(dst.size(), size_t(3));
  ASSERT_EQ(dst[0], "A");
  ASSERT_EQ(dst[1], "B");
  ASSERT_EQ(dst[2], "C");
}
// Copy-assignment must replace the destination's existing contents.
TEST(ContainersVectorTest, Copy) {
  marl::containers::vector<std::string, 4> src;
  marl::containers::vector<std::string, 2> dst;
  src.resize(3);
  src[0] = "A";
  src[1] = "B";
  src[2] = "C";
  dst.resize(1);
  dst[0] = "Z";
  dst = src;
  ASSERT_EQ(dst.size(), size_t(3));
  ASSERT_EQ(dst[0], "A");
  ASSERT_EQ(dst[1], "B");
  ASSERT_EQ(dst[2], "C");
}
// Move-assignment must replace the destination's contents and leave the
// source empty.
TEST(ContainersVectorTest, Move) {
  marl::containers::vector<std::string, 4> src;
  marl::containers::vector<std::string, 2> dst;
  src.resize(3);
  src[0] = "A";
  src[1] = "B";
  src[2] = "C";
  dst.resize(1);
  dst[0] = "Z";
  dst = std::move(src);
  ASSERT_EQ(src.size(), size_t(0));
  ASSERT_EQ(dst.size(), size_t(3));
  ASSERT_EQ(dst[0], "A");
  ASSERT_EQ(dst[1], "B");
  ASSERT_EQ(dst[2], "C");
}
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl/debug.h"
#include "marl/scheduler.h"
#include <cstdarg>
#include <cstdlib>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
namespace marl {
// fatal() prints the printf-style formatted message to stderr and aborts the
// process. Used by marl's assertion macros.
void fatal(const char* msg, ...) {
  va_list vararg;
  va_start(vararg, msg);
  vfprintf(stderr, msg, vararg);
  va_end(vararg);
  abort();
}
// assert_has_bound_scheduler() fatally errors if no marl::Scheduler is bound
// to the current thread. feature names the API that requires the scheduler.
void assert_has_bound_scheduler(const char* feature) {
  MARL_ASSERT(Scheduler::get() != nullptr,
              "%s requires a marl::Scheduler to be bound", feature);
}
} // namespace marl
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl/defer.h"
#include "marl_test.h"
// A deferred statement runs when its enclosing scope exits.
TEST(WithoutBoundScheduler, Defer) {
  bool invoked = false;
  { defer(invoked = true); }
  ASSERT_TRUE(invoked);
}
// Deferred statements run in reverse declaration order on scope exit.
TEST(WithoutBoundScheduler, DeferOrder) {
  int sequence = 0;
  int first = 0, second = 0, third = 0;
  {
    defer(first = ++sequence);
    defer(second = ++sequence);
    defer(third = ++sequence);
  }
  ASSERT_EQ(first, 3);
  ASSERT_EQ(second, 2);
  ASSERT_EQ(third, 1);
}
\ No newline at end of file
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl_test.h"
// Run every WithBoundScheduler test across a range of worker thread counts.
INSTANTIATE_TEST_SUITE_P(
    SchedulerParams,
    WithBoundScheduler,
    testing::Values(SchedulerParams{0},  // Single-threaded mode test
                    SchedulerParams{1},  // Single worker thread
                    SchedulerParams{2},  // 2 worker threads...
                    SchedulerParams{4},
                    SchedulerParams{8},
                    SchedulerParams{64}));

// Test executable entry point.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "marl/scheduler.h"
// SchedulerParams holds Scheduler construction parameters for testing.
struct SchedulerParams {
  int numWorkerThreads;  // worker thread count passed to the scheduler

  // Pretty-printer used by googletest for parameterized test output.
  friend std::ostream& operator<<(std::ostream& os,
                                  const SchedulerParams& params) {
    return os << "SchedulerParams{"
              << "numWorkerThreads: " << params.numWorkerThreads << "}";
  }
};
// WithoutBoundScheduler is a test fixture that does not bind a scheduler.
class WithoutBoundScheduler : public testing::Test {};
// WithBoundScheduler is a parameterized test fixture that performs tests with
// a bound scheduler using a number of different configurations.
class WithBoundScheduler : public testing::TestWithParam<SchedulerParams> {
 public:
  // Binds a new scheduler to this thread, configured from the test params.
  void SetUp() override {
    auto& params = GetParam();
    auto scheduler = new marl::Scheduler();
    scheduler->bind();
    scheduler->setWorkerThreadCount(params.numWorkerThreads);
  }
  // Unbinds and destroys the scheduler created in SetUp().
  void TearDown() override {
    auto scheduler = marl::Scheduler::get();
    scheduler->unbind();
    delete scheduler;
  }
};
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Select the OS fiber implementation:
// - Windows: native fiber implementation.
// - MARL_FIBERS_USE_UCONTEXT: POSIX ucontext-based implementation.
// - otherwise: hand-written assembly fiber switching.
#if defined(_WIN32)
#include "osfiber_windows.h"
#elif defined(MARL_FIBERS_USE_UCONTEXT)
#include "osfiber_ucontext.h"
#else
#include "osfiber_asm.h"
#endif
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__aarch64__)
#include "osfiber_asm_aarch64.h"
// Fiber entry point: invokes |target| with |arg|.
// marl_fiber_set_target() arranges the initial context so that the first swap
// to the fiber begins executing here with these two parameters.
void marl_fiber_trampoline(void (*target)(void*), void* arg) {
  (*target)(arg);
}
// Prepares |ctx| so that the next marl_fiber_swap() into it begins executing
// marl_fiber_trampoline(target, arg) on the given stack.
void marl_fiber_set_target(struct marl_fiber_context* ctx,
                           void* stack,
                           uint32_t stack_size,
                           void (*target)(void*),
                           void* arg) {
  // Address one past the highest byte of the stack allocation.
  uintptr_t stack_top = (uintptr_t)stack + (uintptr_t)stack_size;
  // Resuming the context "returns" into the trampoline via the link register.
  ctx->LR = (uintptr_t)&marl_fiber_trampoline;
  // The first two integer argument registers carry the trampoline parameters.
  ctx->r0 = (uintptr_t)target;
  ctx->r1 = (uintptr_t)arg;
  // Align the initial stack pointer down to 16 bytes.
  ctx->SP = stack_top & ~(uintptr_t)15;
}
#endif // defined(__aarch64__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__arm__)
#include "osfiber_asm_arm.h"
// Fiber entry point: invokes |target| with |arg|.
// marl_fiber_set_target() arranges the initial context so that the first swap
// to the fiber begins executing here with these two parameters.
void marl_fiber_trampoline(void (*target)(void*), void* arg) {
  (*target)(arg);
}
// Prepares |ctx| so that the next marl_fiber_swap() into it begins executing
// marl_fiber_trampoline(target, arg) on the given stack.
void marl_fiber_set_target(struct marl_fiber_context* ctx,
                           void* stack,
                           uint32_t stack_size,
                           void (*target)(void*),
                           void* arg) {
  // Address one past the highest byte of the stack allocation.
  uintptr_t stack_top = (uintptr_t)stack + (uintptr_t)stack_size;
  // Resuming the context "returns" into the trampoline via the link register.
  ctx->LR = (uintptr_t)&marl_fiber_trampoline;
  // The first two integer argument registers carry the trampoline parameters.
  ctx->r0 = (uintptr_t)target;
  ctx->r1 = (uintptr_t)arg;
  // Align the initial stack pointer down to 16 bytes.
  ctx->SP = stack_top & ~(uintptr_t)15;
}
#endif // defined(__arm__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Minimal assembly implementations of fiber context switching for Unix-based
// platforms.
//
// Note: Unlike makecontext, swapcontext or the Windows fiber APIs, these
// assembly implementations *do not* save or restore signal masks,
// floating-point control or status registers, FS and GS segment registers,
// thread-local storage state nor any SIMD registers. This should not be a
// problem as the marl scheduler requires fibers to be executed on a single
// thread.
#if defined(__x86_64__)
#include "osfiber_asm_x64.h"
#elif defined(__i386__)
#include "osfiber_asm_x86.h"
#elif defined(__aarch64__)
#include "osfiber_asm_aarch64.h"
#elif defined(__arm__)
#include "osfiber_asm_arm.h"
#elif defined(__powerpc64__)
#include "osfiber_asm_ppc64.h"
#else
#error "Unsupported target"
#endif
#include <functional>
#include <memory>
extern "C" {

// Implemented per-architecture (C for aarch64/arm, inline in the stack setup
// for others). Initializes |ctx| so that the first swap to it calls
// target(arg) on the given stack.
extern void marl_fiber_set_target(marl_fiber_context*,
                                  void* stack,
                                  uint32_t stack_size,
                                  void (*target)(void*),
                                  void* arg);

// Implemented in the per-architecture assembly files. Saves the current
// execution context into |from| and resumes execution from |to|. Returns only
// when some other fiber swaps back into |from|.
extern void marl_fiber_swap(marl_fiber_context* from,
                            const marl_fiber_context* to);

}  // extern "C"
namespace marl {
// OSFiber is a minimal fiber abstraction built on the marl_fiber_* assembly
// routines declared above.
class OSFiber {
 public:
  // createFiberFromCurrentThread() returns a fiber created from the current
  // thread.
  static inline OSFiber* createFiberFromCurrentThread();

  // createFiber() returns a new fiber with the given stack size that will
  // call func when switched to. func() must end by switching back to another
  // fiber, and must not return.
  static inline OSFiber* createFiber(size_t stackSize,
                                     const std::function<void()>& func);

  // switchTo() immediately switches execution to the given fiber.
  // switchTo() must be called on the currently executing fiber.
  inline void switchTo(OSFiber*);

 private:
  // run() is the entry point handed to marl_fiber_set_target(); it invokes
  // the fiber's target function.
  static inline void run(OSFiber* self);

  // Saved register state for this fiber.
  marl_fiber_context context;
  // Function invoked when this fiber is first switched to (empty for fibers
  // created from an existing thread).
  std::function<void()> target;
  // Backing storage for the fiber's stack (unset for fibers created from an
  // existing thread).
  std::unique_ptr<uint8_t[]> stack;
};
OSFiber* OSFiber::createFiberFromCurrentThread() {
  auto out = new OSFiber();
  // Zero-initialize the context. The real register state is captured the
  // first time this fiber is passed as 'from' to marl_fiber_swap().
  out->context = {};
  return out;
}
OSFiber* OSFiber::createFiber(size_t stackSize,
                              const std::function<void()>& func) {
  auto out = new OSFiber();
  out->context = {};
  out->target = func;
  out->stack = std::unique_ptr<uint8_t[]>(new uint8_t[stackSize]);
  // Point the context at OSFiber::run(out) so the first switchTo() invokes
  // func on the newly allocated stack.
  // NOTE(review): OSFiber::run is cast from void(*)(OSFiber*) to
  // void(*)(void*) and called through the latter by the trampoline.
  // NOTE(review): stackSize (size_t) is narrowed to uint32_t here — stacks of
  // 4GiB or more would be truncated; confirm callers never request that.
  marl_fiber_set_target(&out->context, out->stack.get(), stackSize,
                        reinterpret_cast<void (*)(void*)>(&OSFiber::run), out);
  return out;
}
// Static entry point reached on the first swap into a created fiber; simply
// runs the fiber's target function. Per createFiber()'s contract, target must
// not return.
void OSFiber::run(OSFiber* self) {
  self->target();
}
// Saves this fiber's register state into 'context' and resumes 'fiber'.
// Does not return until some other fiber switches back to this one.
void OSFiber::switchTo(OSFiber* fiber) {
  marl_fiber_swap(&context, &fiber->context);
}
} // namespace marl
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__aarch64__)
#define MARL_BUILD_ASM 1
#include "osfiber_asm_aarch64.h"
// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
// x0: from
// x1: to
.text
.global MARL_ASM_SYMBOL(marl_fiber_swap)
.align 4
MARL_ASM_SYMBOL(marl_fiber_swap):

    // Save context 'from'
    // TODO: pairs of str can be combined with stp.

    // Store special purpose registers
    str x16, [x0, #MARL_REG_r16]
    str x17, [x0, #MARL_REG_r17]
    str x18, [x0, #MARL_REG_r18]

    // Store callee-preserved registers
    str x19, [x0, #MARL_REG_r19]
    str x20, [x0, #MARL_REG_r20]
    str x21, [x0, #MARL_REG_r21]
    str x22, [x0, #MARL_REG_r22]
    str x23, [x0, #MARL_REG_r23]
    str x24, [x0, #MARL_REG_r24]
    str x25, [x0, #MARL_REG_r25]
    str x26, [x0, #MARL_REG_r26]
    str x27, [x0, #MARL_REG_r27]
    str x28, [x0, #MARL_REG_r28]

    // Only the low 64 bits (the d8-d15 views) of v8-v15 are stored, which is
    // the portion AAPCS64 requires callees to preserve.
    str d8, [x0, #MARL_REG_v8]
    str d9, [x0, #MARL_REG_v9]
    str d10, [x0, #MARL_REG_v10]
    str d11, [x0, #MARL_REG_v11]
    str d12, [x0, #MARL_REG_v12]
    str d13, [x0, #MARL_REG_v13]
    str d14, [x0, #MARL_REG_v14]
    str d15, [x0, #MARL_REG_v15]

    // Store sp and lr
    // sp cannot be stored directly; stage it through x2 (a volatile register).
    mov x2, sp
    str x2, [x0, #MARL_REG_SP]
    str x30, [x0, #MARL_REG_LR]

    // Load context 'to'
    // Stage the 'to' pointer in x7 so that x0 and x1 can be overwritten with
    // the context's parameter registers below.
    mov x7, x1

    // Load special purpose registers
    ldr x16, [x7, #MARL_REG_r16]
    ldr x17, [x7, #MARL_REG_r17]
    ldr x18, [x7, #MARL_REG_r18]

    // Load callee-preserved registers
    ldr x19, [x7, #MARL_REG_r19]
    ldr x20, [x7, #MARL_REG_r20]
    ldr x21, [x7, #MARL_REG_r21]
    ldr x22, [x7, #MARL_REG_r22]
    ldr x23, [x7, #MARL_REG_r23]
    ldr x24, [x7, #MARL_REG_r24]
    ldr x25, [x7, #MARL_REG_r25]
    ldr x26, [x7, #MARL_REG_r26]
    ldr x27, [x7, #MARL_REG_r27]
    ldr x28, [x7, #MARL_REG_r28]

    ldr d8, [x7, #MARL_REG_v8]
    ldr d9, [x7, #MARL_REG_v9]
    ldr d10, [x7, #MARL_REG_v10]
    ldr d11, [x7, #MARL_REG_v11]
    ldr d12, [x7, #MARL_REG_v12]
    ldr d13, [x7, #MARL_REG_v13]
    ldr d14, [x7, #MARL_REG_v14]
    ldr d15, [x7, #MARL_REG_v15]

    // Load parameter registers
    ldr x0, [x7, #MARL_REG_r0]
    ldr x1, [x7, #MARL_REG_r1]

    // Load sp and lr
    ldr x30, [x7, #MARL_REG_LR]
    ldr x2, [x7, #MARL_REG_SP]
    mov sp, x2

    // Branch to the restored link register (x30): either the saved resume
    // point of 'to', or marl_fiber_trampoline for a freshly created context.
    ret
#endif // defined(__aarch64__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define MARL_REG_r0 0x00
#define MARL_REG_r1 0x08
#define MARL_REG_r16 0x10
#define MARL_REG_r17 0x18
#define MARL_REG_r18 0x20
#define MARL_REG_r19 0x28
#define MARL_REG_r20 0x30
#define MARL_REG_r21 0x38
#define MARL_REG_r22 0x40
#define MARL_REG_r23 0x48
#define MARL_REG_r24 0x50
#define MARL_REG_r25 0x58
#define MARL_REG_r26 0x60
#define MARL_REG_r27 0x68
#define MARL_REG_r28 0x70
#define MARL_REG_v8 0x78
#define MARL_REG_v9 0x80
#define MARL_REG_v10 0x88
#define MARL_REG_v11 0x90
#define MARL_REG_v12 0x98
#define MARL_REG_v13 0xa0
#define MARL_REG_v14 0xa8
#define MARL_REG_v15 0xb0
#define MARL_REG_SP 0xb8
#define MARL_REG_LR 0xc0
#if defined(__APPLE__)
#define MARL_ASM_SYMBOL(x) _##x
#else
#define MARL_ASM_SYMBOL(x) x
#endif
#ifndef MARL_BUILD_ASM
#include <stdint.h>
// Procedure Call Standard for the ARM 64-bit Architecture
// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
//
// Register state saved and restored by marl_fiber_swap(). Field offsets must
// match the MARL_REG_* constants above (enforced by the static_asserts
// below).
struct marl_fiber_context {
  // parameter registers
  // Loaded into x0/x1 when the context is resumed; marl_fiber_set_target()
  // uses them to pass (target, arg) to the trampoline.
  uintptr_t r0;
  uintptr_t r1;

  // special purpose registers
  uintptr_t r16;
  uintptr_t r17;
  uintptr_t r18;  // platform specific (maybe inter-procedural state)

  // callee-saved registers
  uintptr_t r19;
  uintptr_t r20;
  uintptr_t r21;
  uintptr_t r22;
  uintptr_t r23;
  uintptr_t r24;
  uintptr_t r25;
  uintptr_t r26;
  uintptr_t r27;
  uintptr_t r28;

  // v8-v15 hold only the low 64 bits (d8-d15) of the SIMD registers — the
  // assembly stores them with 'str dN', the only portion AAPCS64 requires to
  // be preserved across calls.
  uintptr_t v8;
  uintptr_t v9;
  uintptr_t v10;
  uintptr_t v11;
  uintptr_t v12;
  uintptr_t v13;
  uintptr_t v14;
  uintptr_t v15;

  uintptr_t SP;  // stack pointer
  uintptr_t LR;  // link register (R30)
};
#ifdef __cplusplus
#include <cstddef>
static_assert(offsetof(marl_fiber_context, r0) == MARL_REG_r0,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r1) == MARL_REG_r1,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r16) == MARL_REG_r16,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r17) == MARL_REG_r17,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r18) == MARL_REG_r18,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r19) == MARL_REG_r19,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r20) == MARL_REG_r20,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r21) == MARL_REG_r21,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r22) == MARL_REG_r22,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r23) == MARL_REG_r23,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r24) == MARL_REG_r24,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r25) == MARL_REG_r25,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r26) == MARL_REG_r26,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r27) == MARL_REG_r27,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r28) == MARL_REG_r28,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v8) == MARL_REG_v8,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v9) == MARL_REG_v9,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v10) == MARL_REG_v10,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v11) == MARL_REG_v11,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v12) == MARL_REG_v12,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v13) == MARL_REG_v13,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v14) == MARL_REG_v14,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v15) == MARL_REG_v15,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, SP) == MARL_REG_SP,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, LR) == MARL_REG_LR,
"Bad register offset");
#endif // __cplusplus
#endif // MARL_BUILD_ASM
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__arm__)
#define MARL_BUILD_ASM 1
#include "osfiber_asm_arm.h"
// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
// r0: from
// r1: to
.text
.global marl_fiber_swap
.align 4
marl_fiber_swap:

    // Save context 'from'
    // TODO: multiple registers can be stored in a single instruction with: stm rA, {rB-rC}

    // Store special purpose registers
    str r12, [r0, #MARL_REG_r12]

    // Store callee-preserved registers
    str r4, [r0, #MARL_REG_r4]
    str r5, [r0, #MARL_REG_r5]
    str r6, [r0, #MARL_REG_r6]
    str r7, [r0, #MARL_REG_r7]
    str r8, [r0, #MARL_REG_r8]
    str r9, [r0, #MARL_REG_r9]
    str r10, [r0, #MARL_REG_r10]
    str r11, [r0, #MARL_REG_r11]

    // NOTE(review): the VFP registers d8-d15 (context fields v8-v15) are not
    // saved or restored here even though the context reserves space for them —
    // confirm no floating-point state needs to survive a swap.

    // Store sp, lr and pc
    str sp, [r0, #MARL_REG_SP]
    str lr, [r0, #MARL_REG_LR]

    // Load context 'to'
    // TODO: multiple registers can be loaded in a single instruction with: ldm rA, {rB-rC}
    // Stage the 'to' pointer in r3 (a volatile register) so that r0 and r1
    // can be overwritten with the context's parameter registers below.
    mov r3, r1

    // Load special purpose registers
    ldr r12, [r3, #MARL_REG_r12]

    // Load callee-preserved registers
    ldr r4, [r3, #MARL_REG_r4]
    ldr r5, [r3, #MARL_REG_r5]
    ldr r6, [r3, #MARL_REG_r6]
    ldr r7, [r3, #MARL_REG_r7]
    ldr r8, [r3, #MARL_REG_r8]
    ldr r9, [r3, #MARL_REG_r9]
    ldr r10, [r3, #MARL_REG_r10]
    ldr r11, [r3, #MARL_REG_r11]

    // Load parameter registers
    ldr r0, [r3, #MARL_REG_r0]
    ldr r1, [r3, #MARL_REG_r1]

    // Load sp, lr and pc
    ldr sp, [r3, #MARL_REG_SP]
    ldr lr, [r3, #MARL_REG_LR]

    // Resume at the restored link register: either the saved resume point of
    // 'to', or marl_fiber_trampoline for a freshly created context.
    mov pc, lr
#endif // defined(__arm__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define MARL_REG_r0 0x00
#define MARL_REG_r1 0x04
#define MARL_REG_r12 0x08
#define MARL_REG_r4 0x0c
#define MARL_REG_r5 0x10
#define MARL_REG_r6 0x14
#define MARL_REG_r7 0x18
#define MARL_REG_r8 0x1c
#define MARL_REG_r9 0x20
#define MARL_REG_r10 0x24
#define MARL_REG_r11 0x28
#define MARL_REG_v8 0x2c
#define MARL_REG_v9 0x30
#define MARL_REG_v10 0x34
#define MARL_REG_v11 0x38
#define MARL_REG_v12 0x3c
#define MARL_REG_v13 0x40
#define MARL_REG_v14 0x44
#define MARL_REG_v15 0x48
#define MARL_REG_SP 0x4c
#define MARL_REG_LR 0x50
#ifndef MARL_BUILD_ASM
#include <stdint.h>
// Procedure Call Standard for the ARM Architecture (32-bit AAPCS)
// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0042f/IHI0042F_aapcs.pdf
//
// Register state saved and restored by marl_fiber_swap(). Field offsets must
// match the MARL_REG_* constants above (enforced by the static_asserts
// below).
struct marl_fiber_context {
  // parameter registers
  // Loaded into r0/r1 when the context is resumed; marl_fiber_set_target()
  // uses them to pass (target, arg) to the trampoline.
  uintptr_t r0;
  uintptr_t r1;

  // special purpose registers
  uintptr_t r12;  // Intra-Procedure-call

  // callee-saved registers
  uintptr_t r4;
  uintptr_t r5;
  uintptr_t r6;
  uintptr_t r7;
  uintptr_t r8;
  uintptr_t r9;
  uintptr_t r10;
  uintptr_t r11;

  // NOTE(review): space reserved for the VFP registers d8-d15, but the 32-bit
  // ARM assembly does not currently save or restore them — confirm intended.
  uintptr_t v8;
  uintptr_t v9;
  uintptr_t v10;
  uintptr_t v11;
  uintptr_t v12;
  uintptr_t v13;
  uintptr_t v14;
  uintptr_t v15;

  uintptr_t SP;  // stack pointer (r13)
  uintptr_t LR;  // link register (r14)
};
#ifdef __cplusplus
#include <cstddef>
static_assert(offsetof(marl_fiber_context, r0) == MARL_REG_r0,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r1) == MARL_REG_r1,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r12) == MARL_REG_r12,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r4) == MARL_REG_r4,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r5) == MARL_REG_r5,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r6) == MARL_REG_r6,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r7) == MARL_REG_r7,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r8) == MARL_REG_r8,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r9) == MARL_REG_r9,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r10) == MARL_REG_r10,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r11) == MARL_REG_r11,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v8) == MARL_REG_v8,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v9) == MARL_REG_v9,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v10) == MARL_REG_v10,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v11) == MARL_REG_v11,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v12) == MARL_REG_v12,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v13) == MARL_REG_v13,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v14) == MARL_REG_v14,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, v15) == MARL_REG_v15,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, SP) == MARL_REG_SP,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, LR) == MARL_REG_LR,
"Bad register offset");
#endif // __cplusplus
#endif // MARL_BUILD_ASM
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__powerpc64__)
#define MARL_BUILD_ASM 1
#include "osfiber_asm_ppc64.h"
// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
// r3: from
// r4: to
.text
.global marl_fiber_swap
.align 4
.type marl_fiber_swap @function
marl_fiber_swap:

    // Save context 'from' (pointed to by r3).

    // Store non-volatile registers
    // r1 is the stack pointer, r2 the TOC pointer, r13 the thread pointer.
    std 1, MARL_REG_R1(3)
    std 2, MARL_REG_R2(3)
    std 13, MARL_REG_R13(3)
    std 14, MARL_REG_R14(3)
    std 15, MARL_REG_R15(3)
    std 16, MARL_REG_R16(3)
    std 17, MARL_REG_R17(3)
    std 18, MARL_REG_R18(3)
    std 19, MARL_REG_R19(3)
    std 20, MARL_REG_R20(3)
    std 21, MARL_REG_R21(3)
    std 22, MARL_REG_R22(3)
    std 23, MARL_REG_R23(3)
    std 24, MARL_REG_R24(3)
    std 25, MARL_REG_R25(3)
    std 26, MARL_REG_R26(3)
    std 27, MARL_REG_R27(3)
    std 28, MARL_REG_R28(3)
    std 29, MARL_REG_R29(3)
    std 30, MARL_REG_R30(3)
    std 31, MARL_REG_R31(3)

    // Store special registers, staged through r5 (a volatile register).
    mflr 5
    std 5, MARL_REG_LR(3)
    mfcr 5
    std 5, MARL_REG_CCR(3)
    // NOTE(review): the condition register is saved above but never restored
    // on the load path below — confirm whether the non-volatile CR fields
    // (CR2-CR4) of 'to' need restoring.

    // Store non-volatile floating point registers
    stfd 14, MARL_REG_FPR14(3)
    stfd 15, MARL_REG_FPR15(3)
    stfd 16, MARL_REG_FPR16(3)
    stfd 17, MARL_REG_FPR17(3)
    stfd 18, MARL_REG_FPR18(3)
    stfd 19, MARL_REG_FPR19(3)
    stfd 20, MARL_REG_FPR20(3)
    stfd 21, MARL_REG_FPR21(3)
    stfd 22, MARL_REG_FPR22(3)
    stfd 23, MARL_REG_FPR23(3)
    stfd 24, MARL_REG_FPR24(3)
    stfd 25, MARL_REG_FPR25(3)
    stfd 26, MARL_REG_FPR26(3)
    stfd 27, MARL_REG_FPR27(3)
    stfd 28, MARL_REG_FPR28(3)
    stfd 29, MARL_REG_FPR29(3)
    stfd 30, MARL_REG_FPR30(3)
    stfd 31, MARL_REG_FPR31(3)

    // Store non-volatile altivec registers
    // v20-v31 are stored 16 bytes apart, with r5 as the running byte offset
    // from r3 into the context's vmx area.
#ifdef __ALTIVEC__
    li 5, MARL_REG_VMX
    stvxl 20, 3, 5
    addi 5, 5, 16
    stvxl 21, 3, 5
    addi 5, 5, 16
    stvxl 22, 3, 5
    addi 5, 5, 16
    stvxl 23, 3, 5
    addi 5, 5, 16
    stvxl 24, 3, 5
    addi 5, 5, 16
    stvxl 25, 3, 5
    addi 5, 5, 16
    stvxl 26, 3, 5
    addi 5, 5, 16
    stvxl 27, 3, 5
    addi 5, 5, 16
    stvxl 28, 3, 5
    addi 5, 5, 16
    stvxl 29, 3, 5
    addi 5, 5, 16
    stvxl 30, 3, 5
    addi 5, 5, 16
    stvxl 31, 3, 5
    mfvrsave 5
    stw 5, MARL_REG_VRSAVE(3)
#endif // __ALTIVEC__

    // Load context 'to' (pointed to by r4).

    // Load non-volatile registers
    ld 1, MARL_REG_R1(4)
    ld 2, MARL_REG_R2(4)
    ld 13, MARL_REG_R13(4)
    ld 14, MARL_REG_R14(4)
    ld 15, MARL_REG_R15(4)
    ld 16, MARL_REG_R16(4)
    ld 17, MARL_REG_R17(4)
    ld 18, MARL_REG_R18(4)
    ld 19, MARL_REG_R19(4)
    ld 20, MARL_REG_R20(4)
    ld 21, MARL_REG_R21(4)
    ld 22, MARL_REG_R22(4)
    ld 23, MARL_REG_R23(4)
    ld 24, MARL_REG_R24(4)
    ld 25, MARL_REG_R25(4)
    ld 26, MARL_REG_R26(4)
    ld 27, MARL_REG_R27(4)
    ld 28, MARL_REG_R28(4)
    ld 29, MARL_REG_R29(4)
    ld 30, MARL_REG_R30(4)
    ld 31, MARL_REG_R31(4)

    // Load non-volatile floating point registers
    lfd 14, MARL_REG_FPR14(4)
    lfd 15, MARL_REG_FPR15(4)
    lfd 16, MARL_REG_FPR16(4)
    lfd 17, MARL_REG_FPR17(4)
    lfd 18, MARL_REG_FPR18(4)
    lfd 19, MARL_REG_FPR19(4)
    lfd 20, MARL_REG_FPR20(4)
    lfd 21, MARL_REG_FPR21(4)
    lfd 22, MARL_REG_FPR22(4)
    lfd 23, MARL_REG_FPR23(4)
    lfd 24, MARL_REG_FPR24(4)
    lfd 25, MARL_REG_FPR25(4)
    lfd 26, MARL_REG_FPR26(4)
    lfd 27, MARL_REG_FPR27(4)
    lfd 28, MARL_REG_FPR28(4)
    lfd 29, MARL_REG_FPR29(4)
    lfd 30, MARL_REG_FPR30(4)
    lfd 31, MARL_REG_FPR31(4)

    // Load non-volatile altivec registers
#ifdef __ALTIVEC__
    li 5, MARL_REG_VMX
    lvxl 20, 4, 5
    addi 5, 5, 16
    lvxl 21, 4, 5
    addi 5, 5, 16
    lvxl 22, 4, 5
    addi 5, 5, 16
    lvxl 23, 4, 5
    addi 5, 5, 16
    lvxl 24, 4, 5
    addi 5, 5, 16
    lvxl 25, 4, 5
    addi 5, 5, 16
    lvxl 26, 4, 5
    addi 5, 5, 16
    lvxl 27, 4, 5
    addi 5, 5, 16
    lvxl 28, 4, 5
    addi 5, 5, 16
    lvxl 29, 4, 5
    addi 5, 5, 16
    lvxl 30, 4, 5
    addi 5, 5, 16
    lvxl 31, 4, 5
    lwz 5, MARL_REG_VRSAVE(4)
    mtvrsave 5
#endif // __ALTIVEC__

    // Load parameters and entrypoint
    // The entry address is the saved LR of 'to'. It is staged in r12, which
    // the ELFv2 ABI expects to hold the entry address at a function's global
    // entry point.
    ld 12, MARL_REG_LR(4)
    ld 3, MARL_REG_R3(4)
    ld 4, MARL_REG_R4(4)
    mtlr 12

    // Branch to entrypoint
    blr
#endif // defined(__powerpc64__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define MARL_REG_R1 0x00
#define MARL_REG_R2 0x08
#define MARL_REG_R13 0x10
#define MARL_REG_R14 0x18
#define MARL_REG_R15 0x20
#define MARL_REG_R16 0x28
#define MARL_REG_R17 0x30
#define MARL_REG_R18 0x38
#define MARL_REG_R19 0x40
#define MARL_REG_R20 0x48
#define MARL_REG_R21 0x50
#define MARL_REG_R22 0x58
#define MARL_REG_R23 0x60
#define MARL_REG_R24 0x68
#define MARL_REG_R25 0x70
#define MARL_REG_R26 0x78
#define MARL_REG_R27 0x80
#define MARL_REG_R28 0x88
#define MARL_REG_R29 0x90
#define MARL_REG_R30 0x98
#define MARL_REG_R31 0xa0
#define MARL_REG_R3 0xa8
#define MARL_REG_R4 0xb0
#define MARL_REG_LR 0xb8
#define MARL_REG_CCR 0xc0
#define MARL_REG_FPR14 0xc8
#define MARL_REG_FPR15 0xd0
#define MARL_REG_FPR16 0xd8
#define MARL_REG_FPR17 0xe0
#define MARL_REG_FPR18 0xe8
#define MARL_REG_FPR19 0xf0
#define MARL_REG_FPR20 0xf8
#define MARL_REG_FPR21 0x100
#define MARL_REG_FPR22 0x108
#define MARL_REG_FPR23 0x110
#define MARL_REG_FPR24 0x118
#define MARL_REG_FPR25 0x120
#define MARL_REG_FPR26 0x128
#define MARL_REG_FPR27 0x130
#define MARL_REG_FPR28 0x138
#define MARL_REG_FPR29 0x140
#define MARL_REG_FPR30 0x148
#define MARL_REG_FPR31 0x150
#define MARL_REG_VRSAVE 0x158
#define MARL_REG_VMX 0x160
#ifndef MARL_BUILD_ASM
#include <stdint.h>
// Register state saved and restored by marl_fiber_swap() under the ppc64
// ELFv2 ABI. Field offsets must match the MARL_REG_* constants above
// (enforced by the static_asserts below).
struct marl_fiber_context {
  // non-volatile registers
  uintptr_t r1;   // stack pointer
  uintptr_t r2;   // TOC pointer
  uintptr_t r13;  // thread pointer
  uintptr_t r14;
  uintptr_t r15;
  uintptr_t r16;
  uintptr_t r17;
  uintptr_t r18;
  uintptr_t r19;
  uintptr_t r20;
  uintptr_t r21;
  uintptr_t r22;
  uintptr_t r23;
  uintptr_t r24;
  uintptr_t r25;
  uintptr_t r26;
  uintptr_t r27;
  uintptr_t r28;
  uintptr_t r29;
  uintptr_t r30;
  uintptr_t r31;

  // first two parameter registers (r3, r4)
  // Loaded into r3/r4 when the context is resumed, carrying the fiber entry
  // arguments.
  uintptr_t r3;
  uintptr_t r4;

  // special registers
  uintptr_t lr;   // link register; holds the resume / entry address
  uintptr_t ccr;  // condition register

  // non-volatile floating-point registers (f14-f31)
  uintptr_t fpr14;
  uintptr_t fpr15;
  uintptr_t fpr16;
  uintptr_t fpr17;
  uintptr_t fpr18;
  uintptr_t fpr19;
  uintptr_t fpr20;
  uintptr_t fpr21;
  uintptr_t fpr22;
  uintptr_t fpr23;
  uintptr_t fpr24;
  uintptr_t fpr25;
  uintptr_t fpr26;
  uintptr_t fpr27;
  uintptr_t fpr28;
  uintptr_t fpr29;
  uintptr_t fpr30;
  uintptr_t fpr31;

  // non-volatile altivec registers
  uint32_t vrsave;
  // Storage for v20-v31: 12 quadword (16-byte) registers expressed as
  // 12 * 2 64-bit words. Must be 16-byte aligned (checked by a static_assert
  // below).
  uintptr_t vmx[12 * 2];
};
// Only the ELFv2 ABI is supported for now.
#if !defined(_CALL_ELF) || (_CALL_ELF != 2)
#error "Only the ppc64 ELFv2 ABI is supported."
#endif
#ifdef __cplusplus
#include <cstddef>
static_assert(offsetof(marl_fiber_context, r1) == MARL_REG_R1,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r2) == MARL_REG_R2,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r13) == MARL_REG_R13,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r15) == MARL_REG_R15,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r16) == MARL_REG_R16,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r17) == MARL_REG_R17,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r18) == MARL_REG_R18,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r19) == MARL_REG_R19,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r20) == MARL_REG_R20,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r21) == MARL_REG_R21,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r22) == MARL_REG_R22,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r23) == MARL_REG_R23,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r24) == MARL_REG_R24,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r25) == MARL_REG_R25,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r26) == MARL_REG_R26,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r27) == MARL_REG_R27,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r28) == MARL_REG_R28,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r29) == MARL_REG_R29,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r30) == MARL_REG_R30,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r31) == MARL_REG_R31,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, r14) == MARL_REG_R14,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, lr) == MARL_REG_LR,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, ccr) == MARL_REG_CCR,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr14) == MARL_REG_FPR14,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr15) == MARL_REG_FPR15,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr16) == MARL_REG_FPR16,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr17) == MARL_REG_FPR17,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr18) == MARL_REG_FPR18,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr19) == MARL_REG_FPR19,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr20) == MARL_REG_FPR20,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr21) == MARL_REG_FPR21,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr22) == MARL_REG_FPR22,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr23) == MARL_REG_FPR23,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr24) == MARL_REG_FPR24,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr25) == MARL_REG_FPR25,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr26) == MARL_REG_FPR26,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr27) == MARL_REG_FPR27,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr28) == MARL_REG_FPR28,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr29) == MARL_REG_FPR29,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr30) == MARL_REG_FPR30,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, fpr31) == MARL_REG_FPR31,
"Bad register offset");
static_assert((offsetof(marl_fiber_context, vmx) % 16) == 0,
"VMX must be quadword aligned");
static_assert(offsetof(marl_fiber_context, vmx) == MARL_REG_VMX,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, vrsave) == MARL_REG_VRSAVE,
"Bad register offset");
#endif // __cplusplus
#endif // MARL_BUILD_ASM
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__x86_64__)
#define MARL_BUILD_ASM 1
#include "osfiber_asm_x64.h"
// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
// rdi: from
// rsi: to
.text
.global MARL_ASM_SYMBOL(marl_fiber_swap)
.align 4
MARL_ASM_SYMBOL(marl_fiber_swap):

    // Save context 'from'

    // Store callee-preserved registers
    movq %rbx, MARL_REG_RBX(%rdi)
    movq %rbp, MARL_REG_RBP(%rdi)
    movq %r12, MARL_REG_R12(%rdi)
    movq %r13, MARL_REG_R13(%rdi)
    movq %r14, MARL_REG_R14(%rdi)
    movq %r15, MARL_REG_R15(%rdi)

    // The resume point of 'from' is this call's return address, which the
    // caller's call instruction left on the stack.
    movq (%rsp), %rcx /* call stores the return address on the stack before jumping */
    movq %rcx, MARL_REG_RIP(%rdi)
    // Save the stack pointer as it will be once this function "returns".
    leaq 8(%rsp), %rcx /* skip the pushed return address */
    movq %rcx, MARL_REG_RSP(%rdi)

    // Load context 'to'
    // Stage the 'to' pointer in r8 (a volatile register) so that rdi and rsi
    // can be overwritten with the context's parameter registers below.
    movq %rsi, %r8

    // Load callee-preserved registers
    movq MARL_REG_RBX(%r8), %rbx
    movq MARL_REG_RBP(%r8), %rbp
    movq MARL_REG_R12(%r8), %r12
    movq MARL_REG_R13(%r8), %r13
    movq MARL_REG_R14(%r8), %r14
    movq MARL_REG_R15(%r8), %r15

    // Load first two call parameters
    movq MARL_REG_RDI(%r8), %rdi
    movq MARL_REG_RSI(%r8), %rsi

    // Load stack pointer
    movq MARL_REG_RSP(%r8), %rsp

    // Load instruction pointer, and jump
    movq MARL_REG_RIP(%r8), %rcx
    jmp *%rcx
#endif // defined(__x86_64__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define MARL_REG_RBX 0x00
#define MARL_REG_RBP 0x08
#define MARL_REG_R12 0x10
#define MARL_REG_R13 0x18
#define MARL_REG_R14 0x20
#define MARL_REG_R15 0x28
#define MARL_REG_RDI 0x30
#define MARL_REG_RSI 0x38
#define MARL_REG_RSP 0x40
#define MARL_REG_RIP 0x48
#if defined(__APPLE__)
#define MARL_ASM_SYMBOL(x) _##x
#else
#define MARL_ASM_SYMBOL(x) x
#endif
#ifndef MARL_BUILD_ASM
#include <stdint.h>
// Register state saved and restored by marl_fiber_swap() under the System V
// AMD64 ABI. Field offsets must match the MARL_REG_* constants above
// (enforced by the static_asserts below).
struct marl_fiber_context {
  // callee-saved registers
  uintptr_t RBX;
  uintptr_t RBP;
  uintptr_t R12;
  uintptr_t R13;
  uintptr_t R14;
  uintptr_t R15;

  // parameter registers
  // Loaded into rdi/rsi when the context is resumed, carrying the fiber
  // entry arguments.
  uintptr_t RDI;
  uintptr_t RSI;

  // stack and instruction registers
  uintptr_t RSP;
  uintptr_t RIP;
};
#ifdef __cplusplus
#include <cstddef>
static_assert(offsetof(marl_fiber_context, RBX) == MARL_REG_RBX,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, RBP) == MARL_REG_RBP,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, R12) == MARL_REG_R12,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, R13) == MARL_REG_R13,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, R14) == MARL_REG_R14,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, R15) == MARL_REG_R15,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, RDI) == MARL_REG_RDI,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, RSI) == MARL_REG_RSI,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, RSP) == MARL_REG_RSP,
"Bad register offset");
static_assert(offsetof(marl_fiber_context, RIP) == MARL_REG_RIP,
"Bad register offset");
#endif // __cplusplus
#endif // MARL_BUILD_ASM
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__i386__)
#define MARL_BUILD_ASM 1
#include "osfiber_asm_x86.h"
// marl_fiber_swap() saves the current execution state into 'from', restores
// the state previously stored in 'to', and resumes execution there. It is
// written in assembly because it must not clobber the registers it saves.
// NOTE(review): unlike the x86-64 variant, the symbol here is not wrapped in
// MARL_ASM_SYMBOL(), so it would lack the leading underscore on Apple i386 -
// confirm whether 32-bit Apple targets are intended to be supported.
//
// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
// esp+4: from
// esp+8: to
.text
.global marl_fiber_swap
.align 4
marl_fiber_swap:
// Save context 'from'
movl 4(%esp), %eax
// Store callee-preserved registers (cdecl: ebx, ebp, esi, edi)
movl %ebx, MARL_REG_EBX(%eax)
movl %ebp, MARL_REG_EBP(%eax)
movl %esi, MARL_REG_ESI(%eax)
movl %edi, MARL_REG_EDI(%eax)
movl (%esp), %ecx /* call stores the return address on the stack before jumping */
movl %ecx, MARL_REG_EIP(%eax)
lea 4(%esp), %ecx /* skip the pushed return address */
movl %ecx, MARL_REG_ESP(%eax)
// Load context 'to' (ecx is caller-saved, safe to use as scratch)
movl 8(%esp), %ecx
// Load callee-preserved registers
movl MARL_REG_EBX(%ecx), %ebx
movl MARL_REG_EBP(%ecx), %ebp
movl MARL_REG_ESI(%ecx), %esi
movl MARL_REG_EDI(%ecx), %edi
// Load stack pointer
movl MARL_REG_ESP(%ecx), %esp
// Load instruction pointer, and jump
movl MARL_REG_EIP(%ecx), %ecx
jmp *%ecx
#endif // defined(__i386__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Byte offsets of each saved register within marl_fiber_context (i386).
// Must match the struct layout below (verified by the static_asserts); the
// assembly in osfiber_asm_x86.S indexes the context with these constants.
#define MARL_REG_EBX 0x00
#define MARL_REG_EBP 0x04
#define MARL_REG_ESI 0x08
#define MARL_REG_EDI 0x0c
#define MARL_REG_ESP 0x10
#define MARL_REG_EIP 0x14
#ifndef MARL_BUILD_ASM
#include <stdint.h>
// Assumes cdecl calling convention.
// Registers EAX, ECX, and EDX are caller-saved, and the rest are callee-saved.
// marl_fiber_context holds the CPU state that must survive a fiber switch.
struct marl_fiber_context {
  // callee-saved registers
  uintptr_t EBX;
  uintptr_t EBP;
  uintptr_t ESI;
  uintptr_t EDI;
  // stack and instruction registers
  uintptr_t ESP;
  uintptr_t EIP;
};
// Compile-time checks that the MARL_REG_* offsets agree with the C layout.
#ifdef __cplusplus
#include <cstddef>
static_assert(offsetof(marl_fiber_context, EBX) == MARL_REG_EBX,
              "Bad register offset");
static_assert(offsetof(marl_fiber_context, EBP) == MARL_REG_EBP,
              "Bad register offset");
static_assert(offsetof(marl_fiber_context, ESI) == MARL_REG_ESI,
              "Bad register offset");
static_assert(offsetof(marl_fiber_context, EDI) == MARL_REG_EDI,
              "Bad register offset");
static_assert(offsetof(marl_fiber_context, ESP) == MARL_REG_ESP,
              "Bad register offset");
static_assert(offsetof(marl_fiber_context, EIP) == MARL_REG_EIP,
              "Bad register offset");
#endif  // __cplusplus
#endif  // MARL_BUILD_ASM
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__powerpc64__)
#include "osfiber_asm_ppc64.h"
// marl_fiber_trampoline() is the entry point of every newly-created fiber:
// it simply invokes the requested target function with its argument.
// target() must not return (see OSFiber::createFiber).
void marl_fiber_trampoline(void (*target)(void*), void* arg) {
  target(arg);
}
// marl_fiber_set_target() prepares |ctx| so that the next switch to it
// begins executing target(arg) at the top of the given stack.
void marl_fiber_set_target(struct marl_fiber_context* ctx,
                           void* stack,
                           uint32_t stack_size,
                           void (*target)(void*),
                           void* arg) {
  // Round the top-of-stack down to 16-byte alignment.
  uintptr_t stack_top = (uintptr_t)((uint8_t*)(stack) + stack_size);
  if ((stack_top % 16) != 0)
    stack_top -= (stack_top % 16);
  // Write a backchain and subtract a minimum stack frame size (32)
  // NOTE(review): when stack_top is already aligned, this first store writes
  // at the very end of the stack buffer (bytes [stack_size, stack_size+8)) -
  // confirm the allocation allows this.
  *(uintptr_t*)stack_top = 0;
  stack_top -= 32;
  *(uintptr_t*)stack_top = stack_top + 32;
  // Load registers
  ctx->r1 = stack_top;                         // stack pointer
  ctx->lr = (uintptr_t)marl_fiber_trampoline;  // resume address
  ctx->r3 = (uintptr_t)target;                 // 1st trampoline parameter
  ctx->r4 = (uintptr_t)arg;                    // 2nd trampoline parameter
  // Thread pointer must be saved in r13
  __asm__ volatile("mr %0, 13\n" : "=r"(ctx->r13));
}
#endif  // __powerpc64__
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "osfiber.h"
#include "marl_test.h"
// Creates three fibers that chain-switch C -> B -> A -> main, each appending
// its letter to |order|, and verifies they executed in that sequence.
TEST(WithoutBoundScheduler, OSFiber) {
  std::string order;
  auto constexpr fiberStackSize = 8 * 1024;
  auto main = std::unique_ptr<marl::OSFiber>(
      marl::OSFiber::createFiberFromCurrentThread());
  std::unique_ptr<marl::OSFiber> fiberA, fiberB, fiberC;
  // The lambdas capture the fiber pointers by reference and only dereference
  // them once run, so the fibers may be constructed in any order.
  fiberA = std::unique_ptr<marl::OSFiber>(
      marl::OSFiber::createFiber(fiberStackSize, [&] {
        order += "A";
        fiberA->switchTo(main.get());
      }));
  fiberB = std::unique_ptr<marl::OSFiber>(
      marl::OSFiber::createFiber(fiberStackSize, [&] {
        order += "B";
        fiberB->switchTo(fiberA.get());
      }));
  fiberC = std::unique_ptr<marl::OSFiber>(
      marl::OSFiber::createFiber(fiberStackSize, [&] {
        order += "C";
        fiberC->switchTo(fiberB.get());
      }));
  main->switchTo(fiberC.get());
  ASSERT_EQ(order, "CBA");
}
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if !defined(_XOPEN_SOURCE)
// This must come before other #includes, otherwise we'll end up with ucontext_t
// definition mismatches, leading to memory corruption hilarity.
#define _XOPEN_SOURCE
#endif // !defined(_XOPEN_SOURCE)
#include "marl/debug.h"
#include <functional>
#include <memory>
#include <ucontext.h>
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif // defined(__clang__)
namespace marl {
// OSFiber implements cooperative fibers on top of the POSIX ucontext API
// (deprecated, hence the -Wdeprecated-declarations suppression above).
class OSFiber {
 public:
  // createFiberFromCurrentThread() returns a fiber created from the current
  // thread.
  static inline OSFiber* createFiberFromCurrentThread();
  // createFiber() returns a new fiber with the given stack size that will
  // call func when switched to. func() must end by switching back to another
  // fiber, and must not return.
  static inline OSFiber* createFiber(size_t stackSize,
                                     const std::function<void()>& func);
  // switchTo() immediately switches execution to the given fiber.
  // switchTo() must be called on the currently executing fiber.
  inline void switchTo(OSFiber*);

 private:
  std::unique_ptr<uint8_t[]> stack;  // Heap stack; null for thread fibers.
  ucontext_t context;
  std::function<void()> target;  // Consumed on first switch (see Main()).
};
// Captures the calling thread's current context; the thread keeps using its
// own stack, so no stack is allocated.
OSFiber* OSFiber::createFiberFromCurrentThread() {
  auto out = new OSFiber();
  out->context = {};
  getcontext(&out->context);
  return out;
}
OSFiber* OSFiber::createFiber(size_t stackSize,
                              const std::function<void()>& func) {
  // makecontext() can only pass int arguments to the entry function, so the
  // OSFiber pointer is smuggled through as a pair of ints via this union.
  // NOTE(review): relies on union type-punning with an anonymous struct and
  // on sizeof(void*) <= 2 * sizeof(int) - confirm for all targets built.
  union Args {
    OSFiber* self;
    struct {
      int a;
      int b;
    };
  };
  struct Target {
    // Entry point invoked by makecontext(): reassembles the OSFiber pointer
    // from the two ints and runs its target function.
    static void Main(int a, int b) {
      Args u;
      u.a = a;
      u.b = b;
      // Move the target out of the fiber so captured state is released when
      // the function completes.
      std::function<void()> func;
      std::swap(func, u.self->target);
      func();
    }
  };
  auto out = new OSFiber();
  out->context = {};
  out->stack = std::unique_ptr<uint8_t[]>(new uint8_t[stackSize]);
  out->target = func;
  // Offset that rounds the stack base up to the next 16-byte boundary.
  auto alignmentOffset =
      15 - (reinterpret_cast<uintptr_t>(out->stack.get() + 15) & 15);
  auto res = getcontext(&out->context);
  MARL_ASSERT(res == 0, "getcontext() returned %d", int(res));
  out->context.uc_stack.ss_sp = out->stack.get() + alignmentOffset;
  out->context.uc_stack.ss_size = stackSize - alignmentOffset;
  out->context.uc_link = nullptr;  // func must never return (see above).
  Args args;
  args.self = out;
  makecontext(&out->context, reinterpret_cast<void (*)()>(&Target::Main), 2,
              args.a, args.b);
  return out;
}
// Saves the current context into this fiber and resumes |fiber|.
void OSFiber::switchTo(OSFiber* fiber) {
  auto res = swapcontext(&context, &fiber->context);
  MARL_ASSERT(res == 0, "swapcontext() returned %d", int(res));
}
} // namespace marl
#if defined(__clang__)
#pragma clang diagnostic pop
#endif // defined(__clang__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <functional>
#include <memory>
#include <Windows.h>
namespace marl {
// OSFiber implements cooperative fibers using the native Windows fiber API.
class OSFiber {
 public:
  inline ~OSFiber();
  // createFiberFromCurrentThread() returns a fiber created from the current
  // thread.
  static inline OSFiber* createFiberFromCurrentThread();
  // createFiber() returns a new fiber with the given stack size that will
  // call func when switched to. func() must end by switching back to another
  // fiber, and must not return.
  static inline OSFiber* createFiber(size_t stackSize,
                                     const std::function<void()>& func);
  // switchTo() immediately switches execution to the given fiber.
  // switchTo() must be called on the currently executing fiber.
  inline void switchTo(OSFiber*);

 private:
  // Fiber entry point registered with CreateFiber().
  static inline void WINAPI run(void* self);
  LPVOID fiber = nullptr;
  bool isFiberFromThread = false;  // Set by createFiberFromCurrentThread().
  std::function<void()> target;    // Consumed on first switch (see run()).
};
// A fiber adopted from a thread is converted back to a plain thread;
// a fiber made by CreateFiber() is deleted.
OSFiber::~OSFiber() {
  if (fiber != nullptr) {
    if (isFiberFromThread) {
      ConvertFiberToThread();
    } else {
      DeleteFiber(fiber);
    }
  }
}
// NOTE(review): ConvertThreadToFiber() fails (returns null) if the thread is
// already a fiber; the result is not checked here - confirm callers never
// double-convert.
OSFiber* OSFiber::createFiberFromCurrentThread() {
  auto out = new OSFiber();
  out->fiber = ConvertThreadToFiber(nullptr);
  out->isFiberFromThread = true;
  return out;
}
OSFiber* OSFiber::createFiber(size_t stackSize,
                              const std::function<void()>& func) {
  auto out = new OSFiber();
  out->fiber = CreateFiber(stackSize, &OSFiber::run, out);
  out->target = func;
  return out;
}
void OSFiber::switchTo(OSFiber* fiber) {
  SwitchToFiber(fiber->fiber);
}
// Fiber entry point: moves the target out of the OSFiber (releasing captured
// state once it completes) and runs it. The target must not return.
void WINAPI OSFiber::run(void* self) {
  std::function<void()> func;
  std::swap(func, reinterpret_cast<OSFiber*>(self)->target);
  func();
}
} // namespace marl
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__x86_64__)
#include "osfiber_asm_x64.h"
// marl_fiber_trampoline() is the entry point of every newly-created fiber:
// it simply invokes the requested target function with its argument.
// target() must not return (see OSFiber::createFiber).
void marl_fiber_trampoline(void (*target)(void*), void* arg) {
  target(arg);
}
// marl_fiber_set_target() prepares |ctx| so that the next marl_fiber_swap()
// to it starts executing marl_fiber_trampoline(target, arg) on the given
// stack. Assumes the stack buffer end is suitably (16-byte) aligned.
void marl_fiber_set_target(struct marl_fiber_context* ctx,
                           void* stack,
                           uint32_t stack_size,
                           void (*target)(void*),
                           void* arg) {
  uintptr_t* stack_top = (uintptr_t*)((uint8_t*)(stack) + stack_size);
  ctx->RIP = (uintptr_t)&marl_fiber_trampoline;
  ctx->RDI = (uintptr_t)target;  // 1st trampoline argument (System V ABI).
  ctx->RSI = (uintptr_t)arg;     // 2nd trampoline argument.
  // Entry emulates a just-called frame: RSP is three slots below the top.
  // NOTE(review): the zeroed null return-address sentinel is written at
  // stack_top[-2] while entry RSP points at stack_top[-3] - harmless since
  // the trampoline never returns, but confirm the intended slot.
  ctx->RSP = (uintptr_t)&stack_top[-3];
  stack_top[-2] = 0;  // No return target.
}
#endif  // defined(__x86_64__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__i386__)
#include "osfiber_asm_x86.h"
// marl_fiber_trampoline() is the entry point of every newly-created fiber:
// it simply invokes the requested target function with its argument.
// target() must not return (see OSFiber::createFiber).
void marl_fiber_trampoline(void (*target)(void*), void* arg) {
  target(arg);
}
// marl_fiber_set_target() prepares |ctx| so that the next marl_fiber_swap()
// to it starts executing marl_fiber_trampoline(target, arg) on the given
// stack, laid out as a cdecl call frame:
//   stack_top[-1]: arg     (2nd argument, esp+8 on entry)
//   stack_top[-2]: target  (1st argument, esp+4 on entry)
//   stack_top[-3]: 0       (fake return address, esp on entry)
void marl_fiber_set_target(struct marl_fiber_context* ctx,
                           void* stack,
                           uint32_t stack_size,
                           void (*target)(void*),
                           void* arg) {
  uintptr_t* stack_top = (uintptr_t*)((uint8_t*)(stack) + stack_size);
  ctx->EIP = (uintptr_t)&marl_fiber_trampoline;
  ctx->ESP = (uintptr_t)&stack_top[-3];
  stack_top[-1] = (uintptr_t)arg;
  stack_top[-2] = (uintptr_t)target;
  stack_top[-3] = 0;  // No return target.
}
#endif  // defined(__i386__)
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl_test.h"
#include "marl/pool.h"
#include "marl/waitgroup.h"
// Checks that pools can be constructed and destructed without ever being
// borrowed from.
TEST_P(WithBoundScheduler, UnboundedPool_ConstructDestruct) {
  marl::UnboundedPool<int> pool;
}
TEST_P(WithBoundScheduler, BoundedPool_ConstructDestruct) {
  marl::BoundedPool<int, 10> pool;
}
// Checks repeated sequential borrows; each loan is a temporary that is
// returned to the pool at the end of the statement.
TEST_P(WithBoundScheduler, UnboundedPool_Borrow) {
  marl::UnboundedPool<int> pool;
  for (int i = 0; i < 100; i++) {
    pool.borrow();
  }
}
// Checks that concurrent borrows from many scheduled tasks are safe.
TEST_P(WithBoundScheduler, UnboundedPool_ConcurrentBorrow) {
  marl::UnboundedPool<int> pool;
  constexpr int iterations = 10000;
  marl::WaitGroup wg(iterations);
  for (int i = 0; i < iterations; i++) {
    marl::schedule([=] {
      pool.borrow();
      wg.done();
    });
  }
  wg.wait();
}
// Checks that a bounded pool can serve exactly its capacity sequentially.
TEST_P(WithBoundScheduler, BoundedPool_Borrow) {
  marl::BoundedPool<int, 100> pool;
  for (int i = 0; i < 100; i++) {
    pool.borrow();
  }
}
// Checks concurrent borrows when the number of tasks (10000) far exceeds
// the pool capacity (10), forcing items to be recycled under contention.
TEST_P(WithBoundScheduler, BoundedPool_ConcurrentBorrow) {
  marl::BoundedPool<int, 10> pool;
  constexpr int iterations = 10000;
  marl::WaitGroup wg(iterations);
  for (int i = 0; i < iterations; i++) {
    marl::schedule([=] {
      pool.borrow();
      wg.done();
    });
  }
  wg.wait();
}
// CtorDtorCounter counts constructions and destructions via global counters,
// used to verify pool item lifetimes under the Reconstruct/Preserve policies.
struct CtorDtorCounter {
  CtorDtorCounter() { ctor_count++; }
  ~CtorDtorCounter() { dtor_count++; }
  // reset() zeroes both counters; call at the start of each test.
  static void reset() {
    ctor_count = 0;
    dtor_count = 0;
  }
  static int ctor_count;
  static int dtor_count;
};
// Initialized to -1, presumably so a test that forgets reset() is detectable.
int CtorDtorCounter::ctor_count = -1;
int CtorDtorCounter::dtor_count = -1;
// With PoolPolicy::Reconstruct, each borrow constructs the item and each
// return destructs it, so both counters advance by exactly one per loan.
TEST_P(WithBoundScheduler, UnboundedPool_PolicyReconstruct) {
  CtorDtorCounter::reset();
  marl::UnboundedPool<CtorDtorCounter, marl::PoolPolicy::Reconstruct> pool;
  ASSERT_EQ(CtorDtorCounter::ctor_count, 0);
  ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
  {
    auto loan = pool.borrow();
    ASSERT_EQ(CtorDtorCounter::ctor_count, 1);
    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
  }
  // Loan returned: the item must have been destructed.
  ASSERT_EQ(CtorDtorCounter::ctor_count, 1);
  ASSERT_EQ(CtorDtorCounter::dtor_count, 1);
  {
    auto loan = pool.borrow();
    ASSERT_EQ(CtorDtorCounter::ctor_count, 2);
    ASSERT_EQ(CtorDtorCounter::dtor_count, 1);
  }
  ASSERT_EQ(CtorDtorCounter::ctor_count, 2);
  ASSERT_EQ(CtorDtorCounter::dtor_count, 2);
}
// Same as above for a bounded pool.
TEST_P(WithBoundScheduler, BoundedPool_PolicyReconstruct) {
  CtorDtorCounter::reset();
  marl::BoundedPool<CtorDtorCounter, 10, marl::PoolPolicy::Reconstruct> pool;
  ASSERT_EQ(CtorDtorCounter::ctor_count, 0);
  ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
  {
    auto loan = pool.borrow();
    ASSERT_EQ(CtorDtorCounter::ctor_count, 1);
    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
  }
  ASSERT_EQ(CtorDtorCounter::ctor_count, 1);
  ASSERT_EQ(CtorDtorCounter::dtor_count, 1);
  {
    auto loan = pool.borrow();
    ASSERT_EQ(CtorDtorCounter::ctor_count, 2);
    ASSERT_EQ(CtorDtorCounter::dtor_count, 1);
  }
  ASSERT_EQ(CtorDtorCounter::ctor_count, 2);
  ASSERT_EQ(CtorDtorCounter::dtor_count, 2);
}
// With PoolPolicy::Preserve, items are kept constructed between loans:
// borrowing/returning must not construct or destruct anything, and all
// destructions happen when the pool itself is destructed.
TEST_P(WithBoundScheduler, UnboundedPool_PolicyPreserve) {
  CtorDtorCounter::reset();
  {
    marl::UnboundedPool<CtorDtorCounter, marl::PoolPolicy::Preserve> pool;
    // The exact number of up-front constructions is an implementation
    // detail; capture it and check it never changes while borrowing.
    int ctor_count;
    {
      auto loan = pool.borrow();
      ASSERT_NE(CtorDtorCounter::ctor_count, 0);
      ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
      ctor_count = CtorDtorCounter::ctor_count;
    }
    ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
    {
      auto loan = pool.borrow();
      ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
      ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
    }
    ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
  }
  // Pool destructed: every constructed item must have been destructed.
  ASSERT_EQ(CtorDtorCounter::ctor_count, CtorDtorCounter::dtor_count);
}
// Same as above for a bounded pool.
TEST_P(WithBoundScheduler, BoundedPool_PolicyPreserve) {
  CtorDtorCounter::reset();
  {
    marl::BoundedPool<CtorDtorCounter, 10, marl::PoolPolicy::Preserve> pool;
    int ctor_count;
    {
      auto loan = pool.borrow();
      ASSERT_NE(CtorDtorCounter::ctor_count, 0);
      ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
      ctor_count = CtorDtorCounter::ctor_count;
    }
    ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
    {
      auto loan = pool.borrow();
      ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
      ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
    }
    ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
  }
  ASSERT_EQ(CtorDtorCounter::ctor_count, CtorDtorCounter::dtor_count);
}
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "osfiber.h" // Must come first. See osfiber_ucontext.h.
#include "marl/scheduler.h"
#include "marl/debug.h"
#include "marl/defer.h"
#include "marl/thread.h"
#include "marl/trace.h"
#if defined(_WIN32)
#include <intrin.h> // __nop()
#endif
// Enable to trace scheduler events.
#define ENABLE_TRACE_EVENTS 0
#if ENABLE_TRACE_EVENTS
#define TRACE(...) MARL_SCOPED_EVENT(__VA_ARGS__)
#else
#define TRACE(...)
#endif
namespace {
// take() removes and returns the element at the front of |queue|.
// The queue must not be empty.
template <typename T>
inline T take(std::queue<T>& queue) {
  T item(std::move(queue.front()));
  queue.pop();
  return item;
}
// nop() emits a single no-op instruction. Used by spinForWork() to busy-wait
// without optimizing the loop away.
inline void nop() {
#if defined(_WIN32)
  __nop();
#else
  __asm__ __volatile__("nop");
#endif
}
} // anonymous namespace
namespace marl {
////////////////////////////////////////////////////////////////////////////////
// Scheduler
////////////////////////////////////////////////////////////////////////////////
// The scheduler bound to the current thread, if any.
thread_local Scheduler* Scheduler::bound = nullptr;
// get() returns the scheduler bound to the calling thread (or nullptr).
Scheduler* Scheduler::get() {
  return bound;
}
// bind() binds this scheduler to the calling thread and creates a
// single-threaded worker for it, so tasks can be scheduled (and run on
// yield/flush) from this thread even with zero dedicated worker threads.
void Scheduler::bind() {
  MARL_ASSERT(bound == nullptr, "Scheduler already bound");
  bound = this;
  {
    std::unique_lock<std::mutex> lock(singleThreadedWorkerMutex);
    auto worker = std::unique_ptr<Worker>(
        new Worker(this, Worker::Mode::SingleThreaded, 0));
    worker->start();
    auto tid = std::this_thread::get_id();
    singleThreadedWorkers.emplace(tid, std::move(worker));
  }
}
// unbind() undoes bind(): the calling thread's single-threaded worker is
// removed from the map, its remaining work is run inline (flush), and the
// thread-local binding is cleared.
void Scheduler::unbind() {
  MARL_ASSERT(bound != nullptr, "No scheduler bound");
  std::unique_ptr<Worker> worker;
  {
    std::unique_lock<std::mutex> lock(bound->singleThreadedWorkerMutex);
    auto tid = std::this_thread::get_id();
    auto it = bound->singleThreadedWorkers.find(tid);
    MARL_ASSERT(it != bound->singleThreadedWorkers.end(),
                "singleThreadedWorker not found");
    worker = std::move(it->second);
    bound->singleThreadedWorkers.erase(tid);
  }
  // Flush and stop outside the lock - flush() may run arbitrary tasks.
  worker->flush();
  worker->stop();
  bound = nullptr;
}
// Scheduler constructor.
// Marks every spinning-worker slot as free (-1) so that enqueue() does not
// mistake an uninitialized slot for a valid spinning worker id.
Scheduler::Scheduler() {
  for (auto& slot : spinningWorkers) {
    slot = -1;
  }
}
// Destructor: every thread must have called unbind() first; all dedicated
// worker threads are then stopped and destroyed.
Scheduler::~Scheduler() {
  {
    std::unique_lock<std::mutex> lock(singleThreadedWorkerMutex);
    MARL_ASSERT(singleThreadedWorkers.size() == 0,
                "Scheduler still bound on %d threads",
                int(singleThreadedWorkers.size()));
  }
  setWorkerThreadCount(0);
}
// setThreadInitializer() registers a function invoked at the start of every
// newly spawned worker thread (see Worker::start()).
void Scheduler::setThreadInitializer(const std::function<void()>& func) {
  std::unique_lock<std::mutex> lock(threadInitFuncMutex);
  threadInitFunc = func;
}
// getThreadInitializer() returns the registered thread initializer.
// NOTE(review): returns a reference to threadInitFunc while the lock is
// released on return - only safe if setThreadInitializer() is not called
// concurrently with worker start-up; confirm intended usage.
const std::function<void()>& Scheduler::getThreadInitializer() {
  std::unique_lock<std::mutex> lock(threadInitFuncMutex);
  return threadInitFunc;
}
// setWorkerThreadCount() adjusts the number of dedicated worker threads.
// Shrinking: all removed workers are stopped before any is deleted, as
// workers may interact (e.g. steal work) while shutting down.
// Growing: new workers are constructed, numWorkerThreads is published, and
// only then are the workers started, so a starting worker never observes an
// out-of-range count.
// newCount of 0 is valid (used by the destructor) and leaves only
// single-threaded (bound) workers.
void Scheduler::setWorkerThreadCount(int newCount) {
  // Fixed: the old message said "positive", but the condition (correctly)
  // also permits zero - ~Scheduler() itself calls this with 0.
  MARL_ASSERT(newCount >= 0, "count must be non-negative");
  auto oldCount = numWorkerThreads;
  for (int idx = oldCount - 1; idx >= newCount; idx--) {
    workerThreads[idx]->stop();
  }
  for (int idx = oldCount - 1; idx >= newCount; idx--) {
    delete workerThreads[idx];
  }
  for (int idx = oldCount; idx < newCount; idx++) {
    workerThreads[idx] = new Worker(this, Worker::Mode::MultiThreaded, idx);
  }
  numWorkerThreads = newCount;
  for (int idx = oldCount; idx < newCount; idx++) {
    workerThreads[idx]->start();
  }
}
// getWorkerThreadCount() returns the number of dedicated worker threads.
int Scheduler::getWorkerThreadCount() {
  return numWorkerThreads;
}
// enqueue() hands |task| to a worker:
// - With worker threads: prefer a worker that recently registered itself as
//   spinning (it will pick the task up with minimal latency), otherwise
//   round-robin, looping until some worker's queue lock is acquired.
// - Without worker threads: push onto the calling thread's single-threaded
//   worker; the task runs on a later yield/flush.
void Scheduler::enqueue(Task&& task) {
  if (numWorkerThreads > 0) {
    while (true) {
      // Prioritize workers that have recently started spinning.
      auto i = --nextSpinningWorkerIdx % spinningWorkers.size();
      auto idx = spinningWorkers[i].exchange(-1);  // Claim the slot.
      if (idx < 0) {
        // If a spinning worker couldn't be found, round-robin the
        // workers.
        idx = nextEnqueueIndex++ % numWorkerThreads;
      }
      auto worker = workerThreads[idx];
      if (worker->tryLock()) {
        worker->enqueueAndUnlock(std::move(task));
        return;
      }
    }
  } else {
    auto tid = std::this_thread::get_id();
    std::unique_lock<std::mutex> lock(singleThreadedWorkerMutex);
    auto it = singleThreadedWorkers.find(tid);
    MARL_ASSERT(it != singleThreadedWorkers.end(),
                "singleThreadedWorker not found");
    it->second->enqueue(std::move(task));
  }
}
// stealWork() attempts to take a queued task from the worker indexed by
// |from| (modulo the worker count) on behalf of |thief|. Returns true and
// assigns |out| on success.
bool Scheduler::stealWork(Worker* thief, uint64_t from, Task& out) {
  if (numWorkerThreads > 0) {
    auto thread = workerThreads[from % numWorkerThreads];
    if (thread != thief) {  // Don't steal from yourself.
      if (thread->dequeue(out)) {
        return true;
      }
    }
  }
  return false;
}
// onBeginSpinning() records that |workerId| has started spinning for work so
// enqueue() can prioritize it. Slots are overwritten round-robin.
void Scheduler::onBeginSpinning(int workerId) {
  auto idx = nextSpinningWorkerIdx++ % spinningWorkers.size();
  spinningWorkers[idx] = workerId;
}
////////////////////////////////////////////////////////////////////////////////
// Fiber
////////////////////////////////////////////////////////////////////////////////
// Fiber constructor: takes ownership of |impl| and binds the fiber to the
// worker that created it - a fiber always resumes on its own worker.
Scheduler::Fiber::Fiber(OSFiber* impl, uint32_t id)
    : id(id), impl(impl), worker(Scheduler::Worker::getCurrent()) {
  MARL_ASSERT(worker != nullptr, "No Scheduler::Worker bound");
}
Scheduler::Fiber::~Fiber() {
  delete impl;
}
// current() returns the fiber executing on the calling thread, or nullptr if
// the thread has no bound worker.
Scheduler::Fiber* Scheduler::Fiber::current() {
  auto worker = Scheduler::Worker::getCurrent();
  return worker != nullptr ? worker->getCurrentFiber() : nullptr;
}
// schedule() marks this (previously yielded) fiber as ready to resume on its
// owning worker.
void Scheduler::Fiber::schedule() {
  worker->enqueue(this);
}
// yield() suspends this fiber until schedule() is called; the worker runs
// other work in the meantime.
void Scheduler::Fiber::yield() {
  MARL_SCOPED_EVENT("YIELD");
  worker->yield(this);
}
// switchTo() transfers execution to |to|; switching to self is a no-op.
void Scheduler::Fiber::switchTo(Fiber* to) {
  if (to != this) {
    impl->switchTo(to->impl);
  }
}
// create() builds a new fiber with its own stack that runs |func| when first
// switched to.
Scheduler::Fiber* Scheduler::Fiber::create(uint32_t id,
                                           size_t stackSize,
                                           const std::function<void()>& func) {
  return new Fiber(OSFiber::createFiber(stackSize, func), id);
}
// createFromCurrentThread() wraps the calling thread's own context as a
// fiber (used for each worker's "main" fiber).
Scheduler::Fiber* Scheduler::Fiber::createFromCurrentThread(uint32_t id) {
  return new Fiber(OSFiber::createFiberFromCurrentThread(), id);
}
////////////////////////////////////////////////////////////////////////////////
// Scheduler::Worker
////////////////////////////////////////////////////////////////////////////////
// The worker executing on the current thread, if any.
thread_local Scheduler::Worker* Scheduler::Worker::current = nullptr;
Scheduler::Worker::Worker(Scheduler* scheduler, Mode mode, uint32_t id)
    : id(id), mode(mode), scheduler(scheduler) {}
// start() begins the worker:
// - MultiThreaded: spawns a dedicated std::thread that binds the scheduler,
//   adopts its own thread context as the main fiber, and enters run().
// - SingleThreaded: adopts the calling thread as the main fiber; work is
//   executed later via flush() / yield().
void Scheduler::Worker::start() {
  switch (mode) {
    case Mode::MultiThreaded:
      thread = std::thread([=] {
        Thread::setName("Thread<%.2d>", int(id));
        // Run the user-supplied per-thread initializer, if one was set.
        if (auto const& initFunc = scheduler->getThreadInitializer()) {
          initFunc();
        }
        Scheduler::bound = scheduler;
        Worker::current = this;
        mainFiber.reset(Fiber::createFromCurrentThread(0));
        currentFiber = mainFiber.get();
        run();
        mainFiber.reset();
        Worker::current = nullptr;
      });
      break;
    case Mode::SingleThreaded:
      Worker::current = this;
      mainFiber.reset(Fiber::createFromCurrentThread(0));
      currentFiber = mainFiber.get();
      break;
    default:
      MARL_ASSERT(false, "Unknown mode: %d", int(mode));
  }
}
// stop() halts the worker. A multi-threaded worker is woken with an empty
// task so it observes the shutdown flag, then its thread is joined.
void Scheduler::Worker::stop() {
  switch (mode) {
    case Mode::MultiThreaded:
      shutdown = true;
      enqueue([] {});  // Ensure the worker is woken up to notice the shutdown.
      thread.join();
      break;
    case Mode::SingleThreaded:
      Worker::current = nullptr;
      break;
    default:
      MARL_ASSERT(false, "Unknown mode: %d", int(mode));
  }
}
// yield() suspends the currently executing fiber |from| (it is blocked) and
// switches to other work, in priority order:
//  1. another fiber that has become unblocked,
//  2. a previously created idle fiber,
//  3. a freshly created worker fiber running run().
// waitForWork() guarantees there is something to do before switching away.
void Scheduler::Worker::yield(Fiber* from) {
  MARL_ASSERT(currentFiber == from,
              "Attempting to call yield from a non-current fiber");
  // Current fiber is yielding as it is blocked.
  // First wait until there's something else this worker can do.
  std::unique_lock<std::mutex> lock(work.mutex);
  waitForWork(lock);
  if (work.fibers.size() > 0) {
    // There's another fiber that has become unblocked, resume that.
    work.num--;
    auto to = take(work.fibers);
    lock.unlock();
    switchToFiber(to);
  } else if (idleFibers.size() > 0) {
    // There's an old fiber we can reuse, resume that.
    auto to = take(idleFibers);
    lock.unlock();
    switchToFiber(to);
  } else {
    // Tasks to process and no existing fibers to resume. Spawn a new fiber.
    lock.unlock();
    switchToFiber(createWorkerFiber());
  }
}
// tryLock() attempts to lock this worker's work queue without blocking.
// On success the caller must call enqueueAndUnlock() to release it.
bool Scheduler::Worker::tryLock() {
  return work.mutex.try_lock();
}
// enqueue(Fiber*) queues an unblocked fiber for resumption on this worker.
void Scheduler::Worker::enqueue(Fiber* fiber) {
  std::unique_lock<std::mutex> lock(work.mutex);
  auto wasIdle = work.num == 0;
  work.fibers.push(std::move(fiber));
  work.num++;
  lock.unlock();
  // Only notify on the idle->busy transition; the worker only sleeps on the
  // condition variable when it had nothing to do.
  if (wasIdle) {
    work.added.notify_one();
  }
}
// enqueue(Task&&) queues a task on this worker.
void Scheduler::Worker::enqueue(Task&& task) {
  work.mutex.lock();  // Released by enqueueAndUnlock().
  enqueueAndUnlock(std::move(task));
}
// enqueueAndUnlock() adds |task| to the queue. work.mutex must already be
// held (via tryLock() or enqueue()); it is released before notifying.
void Scheduler::Worker::enqueueAndUnlock(Task&& task) {
  auto wasIdle = work.num == 0;
  work.tasks.push(std::move(task));
  work.num++;
  work.mutex.unlock();
  if (wasIdle) {
    work.added.notify_one();
  }
}
// dequeue() attempts to remove a task from this worker's queue, returning
// true on success. Used by other workers to steal work; it never blocks.
bool Scheduler::Worker::dequeue(Task& out) {
  if (work.num.load() == 0) {
    return false;  // Cheap early-out without touching the lock.
  }
  if (!work.mutex.try_lock()) {
    return false;  // Queue busy - don't contend; the thief tries elsewhere.
  }
  defer(work.mutex.unlock());
  if (work.tasks.size() == 0) {
    return false;  // Only fibers are queued; only tasks can be taken.
  }
  work.num--;
  out = take(work.tasks);
  return true;
}
// flush() runs all currently queued work on the calling thread. Only valid
// for single-threaded workers (i.e. on the thread that bound the scheduler).
void Scheduler::Worker::flush() {
  MARL_ASSERT(mode == Mode::SingleThreaded,
              "flush() can only be used on a single-threaded worker");
  std::unique_lock<std::mutex> lock(work.mutex);
  runUntilIdle(lock);
}
// run() is the main loop of a worker fiber.
void Scheduler::Worker::run() {
  switch (mode) {
    case Mode::MultiThreaded: {
      MARL_NAME_THREAD("Thread<%.2d> Fiber<%.2d>", int(id),
                       Fiber::current()->id);
      {
        std::unique_lock<std::mutex> lock(work.mutex);
        // Wait for the first item of work (or shutdown) before spinning.
        work.added.wait(lock, [this] { return work.num > 0 || shutdown; });
        while (!shutdown) {
          waitForWork(lock);
          runUntilIdle(lock);
        }
        Worker::current = nullptr;
      }
      // Hand control back to the thread's main fiber so the std::thread
      // lambda in start() can return.
      switchToFiber(mainFiber.get());
      break;
    }
    case Mode::SingleThreaded:
      // Single-threaded workers are driven externally: each pass flushes
      // pending work, parks this fiber as idle and returns to the caller.
      while (!shutdown) {
        flush();
        idleFibers.emplace(currentFiber);
        switchToFiber(mainFiber.get());
      }
      break;
    default:
      MARL_ASSERT(false, "Unknown mode: %d", int(mode));
  }
}
// waitForWork() blocks until this worker has work or is shutting down.
// Before sleeping on the condition variable it registers itself as spinning
// and busy-waits briefly (spinForWork), attempting to steal tasks.
// |lock| must be held on entry and is held again on return.
_Requires_lock_held_(lock) void Scheduler::Worker::waitForWork(
    std::unique_lock<std::mutex>& lock) {
  MARL_ASSERT(work.num == work.fibers.size() + work.tasks.size(),
              "work.num out of sync");
  if (work.num == 0) {
    scheduler->onBeginSpinning(id);
    lock.unlock();
    spinForWork();
    lock.lock();
  }
  work.added.wait(lock, [this] { return work.num > 0 || shutdown; });
}
// spinForWork() busy-waits for up to ~1ms for work to arrive, periodically
// attempting to steal a task from a randomly chosen sibling worker. Spinning
// avoids the wake-up latency of sleeping when work arrives frequently.
void Scheduler::Worker::spinForWork() {
  TRACE("SPIN");
  Task stolen;
  constexpr auto duration = std::chrono::milliseconds(1);
  auto start = std::chrono::high_resolution_clock::now();
  while (std::chrono::high_resolution_clock::now() - start < duration) {
    for (int i = 0; i < 256; i++)  // Empirically picked magic number!
    {
      // clang-format off
      nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
      nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
      nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
      nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
      // clang-format on
      if (work.num > 0) {
        return;  // Work was enqueued directly on this worker.
      }
    }
    if (scheduler->stealWork(this, rng(), stolen)) {
      std::unique_lock<std::mutex> lock(work.mutex);
      work.tasks.emplace(std::move(stolen));
      work.num++;
      return;
    }
    std::this_thread::yield();
  }
}
// runUntilIdle() executes queued fibers and tasks until both queues are
// empty, preferring fibers. |lock| must be held on entry; it is released
// while each item runs and re-acquired afterwards.
_Requires_lock_held_(lock) void Scheduler::Worker::runUntilIdle(
    std::unique_lock<std::mutex>& lock) {
  MARL_ASSERT(work.num == work.fibers.size() + work.tasks.size(),
              "work.num out of sync");
  while (work.fibers.size() > 0 || work.tasks.size() > 0) {
    // Note: we cannot take and store on the stack more than a single fiber
    // or task at a time, as the Fiber may yield and these items may get
    // held on suspended fiber stack.
    while (work.fibers.size() > 0) {
      work.num--;
      auto fiber = take(work.fibers);
      lock.unlock();
      // Park the current fiber for reuse, then resume the unblocked one.
      idleFibers.push(currentFiber);
      switchToFiber(fiber);
      lock.lock();
    }
    if (work.tasks.size() > 0) {
      work.num--;
      auto task = take(work.tasks);
      lock.unlock();
      // Run the task.
      task();
      // std::function<> can carry arguments with complex destructors.
      // Ensure these are destructed outside of the lock.
      task = Task();
      lock.lock();
    }
  }
}
// createWorkerFiber() creates and registers a new fiber running run().
// Fiber ids start at 1; id 0 is reserved for the worker's main fiber.
Scheduler::Fiber* Scheduler::Worker::createWorkerFiber() {
  auto id = workerFibers.size() + 1;
  auto fiber = Fiber::create(id, FiberStackSize, [&] { run(); });
  workerFibers.push_back(std::unique_ptr<Fiber>(fiber));
  return fiber;
}
// switchToFiber() records |to| as the current fiber and transfers control.
void Scheduler::Worker::switchToFiber(Fiber* to) {
  auto from = currentFiber;
  currentFiber = to;
  from->switchTo(to);
}
} // namespace marl
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl_test.h"
#include "marl/waitgroup.h"
TEST(WithoutBoundScheduler, SchedulerConstructAndDestruct) {
auto scheduler = new marl::Scheduler();
delete scheduler;
}
// bind() must make the scheduler retrievable via get() on this thread, and
// unbind() must clear it again.
TEST(WithoutBoundScheduler, SchedulerBindGetUnbind) {
  marl::Scheduler* scheduler = new marl::Scheduler();
  scheduler->bind();
  ASSERT_EQ(scheduler, marl::Scheduler::get());
  scheduler->unbind();
  ASSERT_EQ(marl::Scheduler::get(), nullptr);
  delete scheduler;
}
// getWorkerThreadCount() must report the count this parameterized fixture
// configured the scheduler with.
TEST_P(WithBoundScheduler, SetAndGetWorkerThreadCount) {
  auto const expectedCount = GetParam().numWorkerThreads;
  ASSERT_EQ(marl::Scheduler::get()->getWorkerThreadCount(), expectedCount);
}
// Scheduler teardown (performed by the fixture) must cope with a large
// number of queued tasks that may not have started yet.
TEST_P(WithBoundScheduler, DestructWithPendingTasks) {
  constexpr int taskCount = 10000;
  for (int i = 0; i < taskCount; i++) {
    marl::schedule([] {});
  }
}
// Scheduler teardown must cope with many fibers that were blocked on a
// wait group and are only just resuming when the scheduler is destructed.
TEST_P(WithBoundScheduler, DestructWithPendingFibers) {
  marl::WaitGroup wg(1);
  for (int i = 0; i < 10000; i++) {
    marl::schedule([=] { wg.wait(); });
  }
  // Unblock every task, then immediately tear the scheduler down while
  // fibers may still be in the middle of resuming.
  wg.done();
  auto scheduler = marl::Scheduler::get();
  scheduler->unbind();
  delete scheduler;
  // Rebind a new scheduler so WithBoundScheduler::TearDown() is happy.
  (new marl::Scheduler())->bind();
}
// A fiber that yields (here: blocking on fence.wait()) must be resumed on
// the same OS thread it was suspended on.
TEST_P(WithBoundScheduler, FibersResumeOnSameThread) {
  constexpr int taskCount = 1000;
  marl::WaitGroup fence(1);
  marl::WaitGroup wg(taskCount);
  for (int i = 0; i < taskCount; i++) {
    marl::schedule([=] {
      auto const suspendedOn = std::this_thread::get_id();
      fence.wait();
      ASSERT_EQ(suspendedOn, std::this_thread::get_id());
      wg.done();
    });
  }
  // Give the tasks a chance to reach fence.wait() and yield their fibers.
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  fence.done();
  wg.wait();
}
// As FibersResumeOnSameThread, but the tasks run on externally-created
// std::threads that each bind/unbind the scheduler themselves.
TEST_P(WithBoundScheduler, FibersResumeOnSameStdThread) {
  constexpr int threadCount = 1000;
  auto scheduler = marl::Scheduler::get();
  marl::WaitGroup fence(1);
  marl::WaitGroup wg(threadCount);
  std::vector<std::thread> threads;
  for (int i = 0; i < threadCount; i++) {
    threads.emplace_back([=] {
      scheduler->bind();
      auto const suspendedOn = std::this_thread::get_id();
      fence.wait();
      ASSERT_EQ(suspendedOn, std::this_thread::get_id());
      wg.done();
      scheduler->unbind();
    });
  }
  // Give the threads a chance to block on the fence before releasing it.
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  fence.done();
  wg.wait();
  for (auto& thread : threads) {
    thread.join();
  }
}
\ No newline at end of file
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl/thread.h"
#include "marl/trace.h"
#include <cstdarg>
#include <cstdio>
#if defined(_WIN32)
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#include <cstdlib> // mbstowcs
#elif defined(__APPLE__)
#include <mach/thread_act.h>
#include <pthread.h>
#include <unistd.h>
#else
#include <pthread.h>
#include <unistd.h>
#endif
namespace marl {
#if defined(_WIN32)
// Names the calling thread (printf-style) for debuggers and profilers.
// SetThreadDescription() only exists on newer Windows versions, so it is
// looked up dynamically from kernelbase.dll; silently no-ops when absent.
void Thread::setName(const char* fmt, ...) {
  static auto setThreadDescription =
      reinterpret_cast<HRESULT(WINAPI*)(HANDLE, PCWSTR)>(GetProcAddress(
          GetModuleHandle("kernelbase.dll"), "SetThreadDescription"));
  if (setThreadDescription == nullptr) {
    return;
  }
  char name[1024];
  va_list vararg;
  va_start(vararg, fmt);
  vsnprintf(name, sizeof(name), fmt, vararg);
  va_end(vararg);
  // SetThreadDescription() takes a wide string; convert the narrow name.
  wchar_t wname[1024];
  mbstowcs(wname, name, 1024);
  setThreadDescription(GetCurrentThread(), wname);
  // Also surface the name in marl's own tracing.
  MARL_NAME_THREAD("%s", name);
}
// Returns the number of logical CPUs available to this process, computed
// as the population count of the process affinity mask.
unsigned int Thread::numLogicalCPUs() {
  DWORD_PTR processAffinityMask = 1;
  DWORD_PTR systemAffinityMask = 1;
  GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask,
                         &systemAffinityMask);
  auto count = 0;
  // Count the set bits in the affinity mask.
  while (processAffinityMask > 0) {
    if (processAffinityMask & 1) {
      count++;
    }
    processAffinityMask >>= 1;
  }
  return count;
}
#else
// Names the calling thread (printf-style) for debuggers and profilers.
void Thread::setName(const char* fmt, ...) {
  char name[1024];
  va_list vararg;
  va_start(vararg, fmt);
  vsnprintf(name, sizeof(name), fmt, vararg);
  va_end(vararg);
#if defined(__APPLE__)
  // macOS's pthread_setname_np() can only name the calling thread.
  pthread_setname_np(name);
#elif !defined(__Fuchsia__)
  // Fuchsia has no pthread_setname_np(); skip naming there.
  pthread_setname_np(pthread_self(), name);
#endif
  // Also surface the name in marl's own tracing.
  MARL_NAME_THREAD("%s", name);
}
// Returns the number of logical CPUs currently online.
// sysconf() returns a long and signals failure with -1; the previous code
// converted that directly to unsigned int, yielding a bogus huge count on
// error. Clamp the result to a minimum of one CPU instead.
unsigned int Thread::numLogicalCPUs() {
  auto count = sysconf(_SC_NPROCESSORS_ONLN);
  if (count < 1) {
    count = 1;
  }
  return static_cast<unsigned int>(count);
}
#endif
} // namespace marl
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl_test.h"
#include "marl/ticket.h"
// Tickets taken from a queue must be signalled strictly in take order: even
// though the tasks are scheduled concurrently, each waits for its ticket
// before writing, so `result` is filled sequentially with 0..count-1.
TEST_P(WithBoundScheduler, Ticket) {
  marl::Ticket::Queue queue;
  constexpr int count = 1000;
  std::atomic<int> next = {0};
  int result[count] = {};
  for (int i = 0; i < count; i++) {
    auto ticket = queue.take();
    marl::schedule([ticket, i, &result, &next] {
      ticket.wait();  // Blocks until all earlier tickets have called done().
      result[next++] = i;
      ticket.done();  // Unblocks the next ticket in the queue.
    });
  }
  // A ticket taken after all of the above is only signalled once every task
  // has completed, so this waits for the whole sequence.
  queue.take().wait();
  for (int i = 0; i < count; i++) {
    ASSERT_EQ(result[i], i);
  }
}
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The Trace API produces a trace event file that can be consumed with Chrome's
// about:tracing viewer.
// Documentation can be found at:
// https://www.chromium.org/developers/how-tos/trace-event-profiling-tool
// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit
#include "marl/trace.h"
#include "marl/defer.h"
#include "marl/scheduler.h"
#include "marl/thread.h"
#if MARL_TRACE_ENABLED
#include <atomic>
#include <fstream>
#include <unordered_set>
namespace {

// Chrome traces can choke or error on very large trace files.
// Limit the number of events created to this number.
static constexpr int MaxEvents = 100000;

// Folds a thread ID and fiber ID into the single integer used as the
// trace's "tid" field. Note: distinct (thread, fiber) pairs may collide.
uint64_t threadFiberID(uint32_t threadID, uint32_t fiberID) {
  uint64_t combined = threadID;
  combined *= 31;
  combined += fiberID;
  return combined;
}

}  // anonymous namespace
namespace marl {
// Returns the process-wide Trace singleton, constructed on first use.
Trace* Trace::get() {
  static Trace instance;
  return &instance;
}
// Constructs the tracer: names the current thread "main" and spawns a
// worker thread that drains the event queues into "chrome.trace" as a
// JSON array of event objects.
Trace::Trace() {
  nameThread("main");
  thread = std::thread([&] {
    Thread::setName("Trace worker");
    auto out = std::fstream("chrome.trace", std::ios_base::out);
    out << "[" << std::endl;
    defer(out << std::endl << "]" << std::endl);
    auto first = true;
    for (int i = 0; i < MaxEvents; i++) {
      auto event = take();
      if (event->type() == Event::Type::Shutdown) {
        return;
      }
      // Events are comma-separated JSON objects.
      if (!first) {
        out << "," << std::endl;
      };
      first = false;
      out << "{" << std::endl;
      event->write(out);
      out << "}";
    }
    // Event cap reached: stop accepting new events, but keep draining the
    // queues until the Shutdown event arrives so ~Trace() can join.
    stopped = true;
    while (take()->type() != Event::Type::Shutdown) {
    }
  });
}
// Posts a Shutdown event to unblock the worker thread, then waits for it
// to finish writing the trace file.
Trace::~Trace() {
  put(new Shutdown());
  thread.join();
}
// Emits a metadata event naming the calling thread (printf-style) in the
// trace output. No-op once the event cap has been reached.
void Trace::nameThread(const char* fmt, ...) {
  if (stopped) {
    return;
  }
  auto event = new NameThreadEvent();
  va_list vararg;
  va_start(vararg, fmt);
  vsnprintf(event->name, Trace::MaxEventNameLength, fmt, vararg);
  va_end(vararg);
  put(event);  // put() takes ownership of the event.
}
// Emits an event marking the start of a trace region on the calling thread
// (printf-style name). Pair with endEvent(). No-op once the event cap has
// been reached.
void Trace::beginEvent(const char* fmt, ...) {
  if (stopped) {
    return;
  }
  auto event = new BeginEvent();
  va_list vararg;
  va_start(vararg, fmt);
  vsnprintf(event->name, Trace::MaxEventNameLength, fmt, vararg);
  va_end(vararg);
  event->timestamp = timestamp();
  put(event);  // put() takes ownership of the event.
}
// Emits an event closing the most recent beginEvent() on the calling
// thread. No-op once the event cap has been reached.
void Trace::endEvent() {
  if (stopped) {
    return;
  }
  auto* ev = new EndEvent();
  ev->timestamp = timestamp();
  put(ev);  // put() takes ownership of the event.
}
// Emits the start of an asynchronous trace span identified by `id`
// (printf-style name). Pair with endAsyncEvent() using the same id.
// No-op once the event cap has been reached.
void Trace::beginAsyncEvent(uint32_t id, const char* fmt, ...) {
  if (stopped) {
    return;
  }
  auto event = new AsyncStartEvent();
  va_list vararg;
  va_start(vararg, fmt);
  vsnprintf(event->name, Trace::MaxEventNameLength, fmt, vararg);
  va_end(vararg);
  event->timestamp = timestamp();
  event->id = id;
  put(event);  // put() takes ownership of the event.
}
// Emits the end of the asynchronous trace span started by beginAsyncEvent()
// with the same `id`. No-op once the event cap has been reached.
void Trace::endAsyncEvent(uint32_t id, const char* fmt, ...) {
  if (stopped) {
    return;
  }
  auto event = new AsyncEndEvent();
  va_list vararg;
  va_start(vararg, fmt);
  vsnprintf(event->name, Trace::MaxEventNameLength, fmt, vararg);
  va_end(vararg);
  event->timestamp = timestamp();
  event->id = id;
  put(event);  // put() takes ownership of the event.
}
// Returns the number of microseconds elapsed since this Trace was created.
uint64_t Trace::timestamp() {
  auto const elapsed = std::chrono::high_resolution_clock::now() - createdAt;
  auto const micros =
      std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
  return static_cast<uint64_t>(micros);
}
// Enqueues an event (taking ownership), round-robining across the event
// queues to reduce lock contention with take().
void Trace::put(Event* event) {
  auto idx = eventQueueWriteIdx++ % eventQueues.size();
  auto& queue = eventQueues[idx];
  std::unique_lock<std::mutex> lock(queue.mutex);
  // The consumer only waits when the queue is empty, so a wake-up is only
  // needed when this push is the transition from empty to non-empty.
  auto notify = queue.data.size() == 0;
  queue.data.push(std::unique_ptr<Event>(event));
  // Notify outside the lock so the woken consumer doesn't immediately block.
  lock.unlock();
  if (notify) {
    queue.condition.notify_one();
  }
}
// Blocks until an event is available on the next queue (round-robin, in
// the same queue order used by put()) and returns ownership of it.
std::unique_ptr<Trace::Event> Trace::take() {
  auto idx = eventQueueReadIdx++ % eventQueues.size();
  auto& queue = eventQueues[idx];
  std::unique_lock<std::mutex> lock(queue.mutex);
  queue.condition.wait(lock, [&queue] { return queue.data.size() > 0; });
  auto event = std::move(queue.data.front());
  queue.data.pop();
  return event;
}
#define QUOTE(x) "\"" << x << "\""
#define INDENT " "
// Captures the calling thread's hashed ID and, when running on a scheduler
// fiber, the current fiber's ID at event-creation time.
Trace::Event::Event()
    : threadID(std::hash<std::thread::id>()(std::this_thread::get_id())) {
  if (auto fiber = Scheduler::Fiber::current()) {
    fiberID = fiber->id;
  }
}
// Writes this event's common fields as the body of a Chrome trace JSON
// object: name, optional categories, optional fiber args, tid, ph, pid, ts.
void Trace::Event::write(std::ostream& out) const {
  out << INDENT << QUOTE("name") << ": " << QUOTE(name) << "," << std::endl;
  if (categories != nullptr) {
    out << INDENT << QUOTE("cat") << ": "
        << "\"";
    // `categories` points to a nullptr-terminated array of category
    // strings, emitted comma-separated. (The previous loop iterated the
    // characters of the first string with a never-false pointer test, and
    // never cleared `first`, so separators were never written.)
    auto first = true;
    for (auto category = categories; *category != nullptr; category++) {
      if (!first) {
        out << ",";
      }
      first = false;
      out << *category;
    }
    out << "\"," << std::endl;
  }
  if (fiberID != 0) {
    out << INDENT << QUOTE("args") << ": "
        << "{" << std::endl
        << INDENT << INDENT << QUOTE("fiber") << ": " << fiberID << std::endl
        << INDENT << "}," << std::endl;
  }
  if (threadID != 0) {
    out << INDENT << QUOTE("tid") << ": " << threadFiberID(threadID, fiberID)
        << "," << std::endl;
  }
  out << INDENT << QUOTE("ph") << ": " << QUOTE(static_cast<char>(type()))
      << "," << std::endl
      << INDENT << QUOTE("pid") << ": " << processID << "," << std::endl
      << INDENT << QUOTE("ts") << ": " << timestamp << std::endl;
}
// Writes this event as a Chrome trace "thread_name" metadata record
// (ph "M"), associating `name` with the thread/fiber tid.
void Trace::NameThreadEvent::write(std::ostream& out) const {
  out << INDENT << QUOTE("name") << ": " << QUOTE("thread_name") << ","
      << std::endl
      << INDENT << QUOTE("ph") << ": " << QUOTE("M") << "," << std::endl
      << INDENT << QUOTE("pid") << ": " << processID << "," << std::endl
      << INDENT << QUOTE("tid") << ": " << threadFiberID(threadID, fiberID)
      << "," << std::endl
      << INDENT << QUOTE("args") << ": {" << QUOTE("name") << ": "
      << QUOTE(name) << "}" << std::endl;
}
// Writes the async-specific fields ("id", "cat") then delegates to
// Event::write() for the common fields.
void Trace::AsyncEvent::write(std::ostream& out) const {
  out << INDENT << QUOTE("id") << ": " << QUOTE(id) << "," << std::endl
      << INDENT << QUOTE("cat") << ": " << QUOTE("async") << "," << std::endl;
  Event::write(out);
}
} // namespace marl
#endif // MARL_TRACE_ENABLED
\ No newline at end of file
// Copyright 2019 The Marl Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "marl_test.h"
#include "marl/waitgroup.h"
// done() must be callable down to zero without a scheduler bound to the
// calling thread.
TEST(WithoutBoundScheduler, WaitGroupDone) {
  marl::WaitGroup wg(2);  // Should not require a scheduler.
  wg.done();
  wg.done();
}
#if MARL_DEBUG_ENABLED
// Calling done() more times than the initial count must trip the debug
// assertion (hence the EXPECT_DEATH and the MARL_DEBUG_ENABLED guard).
TEST(WithoutBoundScheduler, WaitGroupDoneTooMany) {
  marl::WaitGroup wg(2);  // Should not require a scheduler.
  wg.done();
  wg.done();
  EXPECT_DEATH(wg.done(), "done\\(\\) called too many times");
}
#endif // MARL_DEBUG_ENABLED
// wait() must block until the single scheduled task has called done().
TEST_P(WithBoundScheduler, WaitGroup_OneTask) {
  marl::WaitGroup wg(1);
  std::atomic<int> hits = {0};
  marl::schedule([&hits, wg] {
    hits.fetch_add(1);
    wg.done();
  });
  wg.wait();
  ASSERT_EQ(hits.load(), 1);
}
// wait() must block until all ten scheduled tasks have called done().
TEST_P(WithBoundScheduler, WaitGroup_10Tasks) {
  constexpr int taskCount = 10;
  marl::WaitGroup wg(taskCount);
  std::atomic<int> hits = {0};
  for (int i = 0; i < taskCount; i++) {
    marl::schedule([&hits, wg] {
      hits.fetch_add(1);
      wg.done();
    });
  }
  wg.wait();
  ASSERT_EQ(hits.load(), taskCount);
}
Subproject commit fb49e6c164490a227bbb7cf5223b846c836a0305
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment