Subzero. Native 64-bit int arithmetic on x86-64.

This CL modifies the x86 instruction selection template to allow native 64-bit GPR support. It also enables x86-64 crosstests.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1273153002.

Subzero. Native 64-bit int arithmetic on x86-64.
1d235425 · John Porto · 83ccadcf · 1d235425 · 1d235425 · 1d235425
Commit 1d235425 authored Aug 12, 2015 by John Porto
49 changed files
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -325,6 +325,7 @@ $(OBJDIR)/unittest/AssemblerX8664: $(OBJDIR)/unittest
 RT_SRC := runtime/szrt.c runtime/szrt_ll.ll runtime/szrt_profiler.c
 RT_OBJ := build/runtime/szrt_native_x8632.o build/runtime/szrt_sb_x8632.o \
+	build/runtime/szrt_native_x8664.o build/runtime/szrt_sb_x8664.o \
 	build/runtime/szrt_native_arm32.o build/runtime/szrt_sb_arm32.o
 runtime: $(RT_OBJ)
@@ -348,10 +349,13 @@ else
 check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime
       # Do all native/sse2 tests, but only test_vector_ops for native/sse4.1.
       # For (slow) sandboxed tests, limit to Om1/sse4.1.
+       # TODO(jpp): implement x8664 sandbox, then enable xtests.
 	./pydir/crosstest_generator.py -v --lit \
 	  --toolchain-root $(TOOLCHAIN_ROOT) \
 	  -i x8632,native,sse2 -i x8632,native,sse4.1,test_vector_ops \
 	  -i x8632,sandbox,sse4.1,Om1 \
+	  -i x8664,native,sse2 -i x8664,native,sse4.1,test_vector_ops \
+	  -e x8664,native,sse2,test_global \
 	  -i arm32,native,neon,simple_loop \
 	  -i arm32,native,neon,mem_intrin \
 	  -i arm32,native,neon,test_bitmanip \

--- a/crosstest/mem_intrin.cpp
+++ b/crosstest/mem_intrin.cpp
@@ -8,6 +8,7 @@
 #include <cstring>
 #include "mem_intrin.h"
+#include "xdefs.h"
 typedef int elem_t;
@@ -15,9 +16,9 @@ typedef int elem_t;
 * Reset buf to the sequence of bytes: n, n+1, n+2 ... length - 1
 */
 static void __attribute__((noinline))
-reset_buf(uint8_t *buf, uint8_t init, size_t length) {
+reset_buf(uint8_t *buf, uint8_t init, SizeT length) {
-  size_t i;
+  SizeT i;
-  size_t v = init;
+  SizeT v = init;
  for (i = 0; i < length; ++i)
    buf[i] = v++;
 }
@@ -27,8 +28,8 @@ reset_buf(uint8_t *buf, uint8_t init, size_t length) {
 * smaller buffers, whose total won't approach 2**16).
 */
 static int __attribute__((noinline))
-fletcher_checksum(uint8_t *buf, size_t length) {
+fletcher_checksum(uint8_t *buf, SizeT length) {
-  size_t i;
+  SizeT i;
  int sum = 0;
  int sum_of_sums = 0;
  const int kModulus = 255;
@@ -63,20 +64,20 @@ int memset_test_fixed_len(uint8_t init) {
  return fletcher_checksum((uint8_t *)buf, BYTE_LENGTH);
 }
-int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length) {
+int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length) {
  reset_buf(buf, init, length);
  memcpy((void *)buf2, (void *)buf, length);
  return fletcher_checksum(buf2, length);
 }
-int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length) {
+int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length) {
  int sum1;
  int sum2;
  const int overlap_bytes = 4 * sizeof(elem_t);
  if (length <= overlap_bytes)
    return 0;
  uint8_t *overlap_buf = buf + overlap_bytes;
-  size_t reduced_length = length - overlap_bytes;
+  SizeT reduced_length = length - overlap_bytes;
  reset_buf(buf, init, length);
  /* Test w/ overlap. */
@@ -88,7 +89,7 @@ int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length) {
  return sum1 + sum2;
 }
-int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length) {
+int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length) {
  memset((void *)buf, init, length);
  memset((void *)buf2, init + 4, length);
  return fletcher_checksum(buf, length) + fletcher_checksum(buf2, length);

--- a/crosstest/mem_intrin.h
+++ b/crosstest/mem_intrin.h
@@ -4,10 +4,11 @@
 * There is no include guard since this will be included multiple times,
 * under different namespaces.
 */
+#include "xdefs.h"
-int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length);
+int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
-int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length);
+int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
-int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length);
+int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
 int memcpy_test_fixed_len(uint8_t init);
 int memmove_test_fixed_len(uint8_t init);

--- a/crosstest/mem_intrin_main.cpp
+++ b/crosstest/mem_intrin_main.cpp
@@ -5,6 +5,8 @@
 #include <cstdio>
 #include "mem_intrin.h"
+#include "xdefs.h"
 namespace Subzero_ {
 #include "mem_intrin.h"
 }
@@ -12,7 +14,7 @@ namespace Subzero_ {
 #define XSTR(s) STR(s)
 #define STR(s) #s
-void testFixedLen(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+void testFixedLen(SizeT &TotalTests, SizeT &Passes, SizeT &Failures) {
 #define do_test_fixed(test_func)                                               \
  for (uint8_t init_val = 0; init_val < 100; ++init_val) {                     \
    ++TotalTests;                                                              \
@@ -33,11 +35,11 @@ void testFixedLen(size_t &TotalTests, size_t &Passes, size_t &Failures) {
 #undef do_test_fixed
 }
-void testVariableLen(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+void testVariableLen(SizeT &TotalTests, SizeT &Passes, SizeT &Failures) {
  uint8_t buf[256];
  uint8_t buf2[256];
 #define do_test_variable(test_func)                                            \
-  for (size_t len = 4; len < 128; ++len) {                                     \
+  for (SizeT len = 4; len < 128; ++len) {                                      \
    for (uint8_t init_val = 0; init_val < 100; ++init_val) {                   \
      ++TotalTests;                                                            \
      int llc_result = test_func(buf, buf2, init_val, len);                    \
@@ -58,7 +60,11 @@ void testVariableLen(size_t &TotalTests, size_t &Passes, size_t &Failures) {
 #undef do_test_variable
 }
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  unsigned TotalTests = 0;
  unsigned Passes = 0;
  unsigned Failures = 0;

--- a/crosstest/simple_loop_main.c
+++ b/crosstest/simple_loop_main.c
@@ -6,7 +6,11 @@
 int simple_loop(int *a, int n);
 int Subzero_simple_loop(int *a, int n);
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  unsigned TotalTests = 0;
  unsigned Passes = 0;
  unsigned Failures = 0;

--- a/crosstest/stack_hack.x8664.c
+++ b/crosstest/stack_hack.x8664.c
+//===- subzero/crosstest/stack_hack.x8664.c - X8664 stack hack ------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements main() for crosstests in x86-64.
+//
+//===----------------------------------------------------------------------===//
+#include <assert.h>
+#include <stdint.h>
+#include <sys/mman.h>
+// X8664_STACK_HACK needs to be defined before xdefs.h is included.
+#define X8664_STACK_HACK
+#include "xdefs.h"
+/// xSetStack sets %rsp to NewRsp and stores the previous %rsp value through
+/// OldRsp, which must point to a pointer-sized lvalue.
+///
+/// One register carries NewRsp into the asm and the old %rsp out of it (the
+/// "0" matching constraint ties the input to output operand 0), so the asm
+/// never modifies an input-only operand. The "memory" clobber keeps the
+/// compiler from moving memory accesses across the stack switch.
+#define xSetStack(NewRsp, OldRsp)                                              \
+  do {                                                                         \
+    __asm__ volatile("xchgq   %0, %%rsp"                                       \
+                     : "=r"(*(OldRsp))                                         \
+                     : "0"(NewRsp)                                             \
+                     : "memory");                                              \
+  } while (0)
+extern int wrapped_main(int argc, char *argv[]);
+/// Returns the lowest address of a Size-byte region that ends at StackEnd.
+///
+/// Asserts that StackEnd - Size does not wrap below zero and that the
+/// resulting start address is aligned to a 4 MB boundary.
+unsigned char *xStackStart(uint32 StackEnd, uint32 Size) {
+  const uint32 PageBoundary = 4 << 20; // 4 MB.
+  // Widen before subtracting: if Size > StackEnd the borrow then lands in the
+  // upper 32 bits, where the assert below can see it, instead of being lost
+  // in a narrower subtraction. (Assumes xdefs.h makes uint32 32 bits and
+  // uint64 64 bits wide -- TODO confirm.)
+  const uint64 StackStart = (uint64)StackEnd - Size;
+  assert((StackStart & 0xFFFFFFFF00000000ull) == 0 && "StackStart wraps.");
+  // Aligned means none of the bits below the boundary are set.
+  assert((StackStart & (PageBoundary - 1)) == 0 &&
+         "StackStart not aligned to page boundary.");
+  // Keeps PageBoundary "used" when assert() compiles to nothing.
+  (void)PageBoundary;
+  return (unsigned char *)StackStart;
+}
+/// Maps Size bytes of private, anonymous, read/write memory at the fixed
+/// address xStackStart(StackEnd, Size) and returns the mapping's address.
+///
+/// StackEnd must fit in 32 bits. Both that precondition and the mmap result
+/// are checked with assert() only.
+unsigned char *xAllocStack(uint64 StackEnd, uint32 Size) {
+  const int Prot = PROT_READ | PROT_WRITE;
+  const int Flags = MAP_FIXED | MAP_PRIVATE | MAP_GROWSDOWN | MAP_ANONYMOUS;
+  assert((StackEnd & 0xFFFFFFFF00000000ull) == 0 && "Invalid StackEnd.");
+  unsigned char *const Base = xStackStart(StackEnd, Size);
+  void *Mapping = mmap(Base, Size, Prot, Flags, -1, 0);
+  assert(Mapping != MAP_FAILED && "mmap failed. no stack.");
+  return Mapping;
+}
+/// Unmaps the Size-byte region that starts at xStackStart(StackEnd, Size).
+/// StackEnd must fit in 32 bits (asserted). The munmap result is not checked.
+void xDeallocStack(uint64 StackEnd, uint32 Size) {
+  assert((StackEnd & 0xFFFFFFFF00000000ull) == 0 && "Invalid StackEnd.");
+  unsigned char *const Base = xStackStart(StackEnd, Size);
+  munmap(Base, Size);
+}
+/// Runs wrapped_main() on a freshly mapped stack that ends at StackEnd.
+///
+/// Allocates the stack, points %rsp at its top, calls wrapped_main(), restores
+/// the original %rsp, unmaps the stack, and returns wrapped_main()'s result.
+int main(int argc, char *argv[]) {
+  // These "locals" need to live **NOT** in the stack: %rsp is switched
+  // underneath this function, so everything used across the switch is static.
+  static int Argc;
+  static char **Argv;
+  static const uint32_t StackEnd = 0x80000000;
+  static const uint32_t StackSize = 40 * 1024 * 1024;
+  static unsigned char *new_rsp;
+  static unsigned char *old_rsp;
+  static int Failures;
+  Argc = argc;
+  Argv = argv;
+  // xAllocStack returns the mapping's lowest address; adding StackSize gives
+  // the top of the new stack.
+  new_rsp = xAllocStack(StackEnd, StackSize) + StackSize;
+  xSetStack(new_rsp, &old_rsp);
+  Failures = wrapped_main(Argc, Argv);
+  // Restore the original stack; the value saved into new_rsp is not used.
+  xSetStack(old_rsp, &new_rsp);
+  xDeallocStack(StackEnd, StackSize);
+  return Failures;
+}
--- a/crosstest/test_arith.cpp
+++ b/crosstest/test_arith.cpp
@@ -17,13 +17,14 @@
 #include <stdint.h>
 #include "test_arith.h"
+#include "xdefs.h"
 #define X(inst, op, isdiv, isshift)                                            \
  bool test##inst(bool a, bool b) { return a op b; }                           \
  uint8_t test##inst(uint8_t a, uint8_t b) { return a op b; }                  \
  uint16_t test##inst(uint16_t a, uint16_t b) { return a op b; }               \
  uint32_t test##inst(uint32_t a, uint32_t b) { return a op b; }               \
-  uint64_t test##inst(uint64_t a, uint64_t b) { return a op b; }               \
+  uint64 test##inst(uint64 a, uint64 b) { return a op b; }                     \
  v4ui32 test##inst(v4ui32 a, v4ui32 b) { return a op b; }                     \
  v8ui16 test##inst(v8ui16 a, v8ui16 b) { return a op b; }                     \
  v16ui8 test##inst(v16ui8 a, v16ui8 b) { return a op b; }
@@ -35,7 +36,7 @@ UINTOP_TABLE
  myint8_t test##inst(myint8_t a, myint8_t b) { return a op b; }               \
  int16_t test##inst(int16_t a, int16_t b) { return a op b; }                  \
  int32_t test##inst(int32_t a, int32_t b) { return a op b; }                  \
-  int64_t test##inst(int64_t a, int64_t b) { return a op b; }                  \
+  int64 test##inst(int64 a, int64 b) { return a op b; }                        \
  v4si32 test##inst(v4si32 a, v4si32 b) { return a op b; }                     \
  v8si16 test##inst(v8si16 a, v8si16 b) { return a op b; }                     \
  v16si8 test##inst(v16si8 a, v16si8 b) { return a op b; }

--- a/crosstest/test_arith.h
+++ b/crosstest/test_arith.h
@@ -14,6 +14,7 @@
 #include <stdint.h>
 #include "test_arith.def"
+#include "xdefs.h"
 #include "vectors.h"
@@ -22,7 +23,7 @@
  uint8_t test##inst(uint8_t a, uint8_t b);                                    \
  uint16_t test##inst(uint16_t a, uint16_t b);                                 \
  uint32_t test##inst(uint32_t a, uint32_t b);                                 \
-  uint64_t test##inst(uint64_t a, uint64_t b);                                 \
+  uint64 test##inst(uint64 a, uint64 b);                                       \
  v4ui32 test##inst(v4ui32 a, v4ui32 b);                                       \
  v8ui16 test##inst(v8ui16 a, v8ui16 b);                                       \
  v16ui8 test##inst(v16ui8 a, v16ui8 b);
@@ -34,7 +35,7 @@ UINTOP_TABLE
  myint8_t test##inst(myint8_t a, myint8_t b);                                 \
  int16_t test##inst(int16_t a, int16_t b);                                    \
  int32_t test##inst(int32_t a, int32_t b);                                    \
-  int64_t test##inst(int64_t a, int64_t b);                                    \
+  int64 test##inst(int64 a, int64 b);                                          \
  v4si32 test##inst(v4si32 a, v4si32 b);                                       \
  v8si16 test##inst(v8si16 a, v8si16 b);                                       \
  v16si8 test##inst(v16si8 a, v16si8 b);

--- a/crosstest/test_arith_main.cpp
+++ b/crosstest/test_arith_main.cpp
@@ -28,6 +28,8 @@
 // Subzero_ namespace, corresponding to the llc and Subzero translated
 // object files, respectively.
 #include "test_arith.h"
+#include "xdefs.h"
 namespace Subzero_ {
 #include "test_arith.h"
 }
@@ -363,7 +365,11 @@ void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  }
 }
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;
@@ -372,7 +378,7 @@ int main(int argc, char **argv) {
  testsInt<uint8_t, myint8_t>(TotalTests, Passes, Failures);
  testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
  testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
-  testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures);
+  testsInt<uint64, int64>(TotalTests, Passes, Failures);
  testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures);
  testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures);
  testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures);

--- a/crosstest/test_bitmanip.def
+++ b/crosstest/test_bitmanip.def
@@ -14,6 +14,8 @@
 #ifndef TEST_BIT_MANIP_DEF
 #define TEST_BIT_MANIP_DEF
+#include "xdefs.h"
 #define STR(s) #s
 #define BMI_OPS  \
@@ -25,13 +27,13 @@
 #define BMI_TYPES \
  /* type */      \
-  X(uint32_t)     \
+  X(uint32)     \
-  X(uint64_t)
+  X(uint64)
 // #define X(type)
 #define FOR_ALL_BMI_TYPES_INST(F, inst) \
-  F(inst, uint32_t)                     \
+  F(inst, uint32)                     \
-  F(inst, uint64_t)
+  F(inst, uint64)
 #define FOR_ALL_BMI_OP_TYPES(X) \
  FOR_ALL_BMI_TYPES_INST(X, ctlz)     \
@@ -42,7 +44,7 @@
 #define BSWAP_TABLE              \
  /* type, builtin_name */       \
  X(uint16_t, __builtin_bswap16) \
-  X(uint32_t, __builtin_bswap32) \
+  X(uint32, __builtin_bswap32) \
-  X(uint64_t, __builtin_bswap64)
+  X(uint64, __builtin_bswap64)
 #endif // TEST_BIT_MANIP_DEF
--- a/crosstest/test_bitmanip_main.cpp
+++ b/crosstest/test_bitmanip_main.cpp
@@ -23,11 +23,13 @@
 // Subzero_ namespace, corresponding to the llc and Subzero translated
 // object files, respectively.
 #include "test_bitmanip.h"
+#include "xdefs.h"
 namespace Subzero_ {
 #include "test_bitmanip.h"
 }
-volatile uint64_t Values[] = {
+volatile uint64 Values[] = {
    0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff, 0x7ffe, 0x7fff, 0x8000, 0x8001,
    0xfffe, 0xffff, 0xc0de, 0xabcd, 0xdcba, 0x007fffff /*Max subnormal + */,
    0x00800000 /*Min+ */, 0x7f7fffff /*Max+ */, 0x7f800000 /*+Inf*/,
@@ -71,9 +73,9 @@ void testBitManip(size_t &TotalTests, size_t &Passes, size_t &Failures) {
      } else {
        ++Failures;
        std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
-                  << "(" << static_cast<uint64_t>(Value)
+                  << "(" << static_cast<uint64>(Value)
-                  << "): sz=" << static_cast<uint64_t>(ResultSz)
+                  << "): sz=" << static_cast<uint64>(ResultSz)
-                  << " llc=" << static_cast<uint64_t>(ResultLlc) << "\n";
+                  << " llc=" << static_cast<uint64>(ResultLlc) << "\n";
      }
    }
  }
@@ -101,24 +103,28 @@ void testByteSwap(size_t &TotalTests, size_t &Passes, size_t &Failures) {
      } else {
        ++Failures;
        std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
-                  << "(" << static_cast<uint64_t>(Value)
+                  << "(" << static_cast<uint64>(Value)
-                  << "): sz=" << static_cast<uint64_t>(ResultSz)
+                  << "): sz=" << static_cast<uint64>(ResultSz)
-                  << " llc=" << static_cast<uint64_t>(ResultLlc) << "\n";
+                  << " llc=" << static_cast<uint64>(ResultLlc) << "\n";
      }
    }
  }
 }
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;
  testBitManip<uint32_t>(TotalTests, Passes, Failures);
-  testBitManip<uint64_t>(TotalTests, Passes, Failures);
+  testBitManip<uint64>(TotalTests, Passes, Failures);
  testByteSwap<uint16_t>(TotalTests, Passes, Failures);
  testByteSwap<uint32_t>(TotalTests, Passes, Failures);
-  testByteSwap<uint64_t>(TotalTests, Passes, Failures);
+  testByteSwap<uint64>(TotalTests, Passes, Failures);
  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
            << " Failures=" << Failures << "\n";

--- a/crosstest/test_calling_conv.cpp
+++ b/crosstest/test_calling_conv.cpp
@@ -17,6 +17,7 @@
 #include <cstring>
 #include "test_calling_conv.h"
+#include "xdefs.h"
 #define CALL_AS_TYPE(Ty, Func) (reinterpret_cast<Ty *>(Func))
@@ -37,9 +38,9 @@ void caller_vvvvv(void) {
 void caller_vlvlivfvdviv(void) {
  v4f32 arg1 = {0, 1, 2, 3};
-  int64_t arg2 = 4;
+  int64 arg2 = 4;
  v4f32 arg3 = {6, 7, 8, 9};
-  int64_t arg4 = 10;
+  int64 arg4 = 10;
  int arg5 = 11;
  v4f32 arg6 = {12, 13, 14, 15};
  float arg7 = 16;
@@ -75,8 +76,8 @@ callee_vvvvv(v4si32 arg1, v4si32 arg2, v4si32 arg3, v4si32 arg4, v4si32 arg5) {
 }
 void __attribute__((noinline))
-callee_vlvlivfvdviv(v4f32 arg1, int64_t arg2, v4f32 arg3, int64_t arg4,
+callee_vlvlivfvdviv(v4f32 arg1, int64 arg2, v4f32 arg3, int64 arg4, int arg5,
-                    int arg5, v4f32 arg6, float arg7, v4f32 arg8, double arg9,
+                    v4f32 arg6, float arg7, v4f32 arg8, double arg9,
                    v4f32 arg10, int arg11, v4f32 arg12) {
  switch (ArgNum) {
    HANDLE_ARG(1);

--- a/crosstest/test_calling_conv.h
+++ b/crosstest/test_calling_conv.h
@@ -14,6 +14,7 @@
 #include "test_calling_conv.def"
 #include "vectors.h"
+#include "xdefs.h"
 typedef void (*CalleePtrTy)();
 extern CalleePtrTy Callee;
@@ -31,6 +32,6 @@ typedef void(callee_vvvvv_Ty)(v4si32, v4si32, v4si32, v4si32, v4si32);
 callee_vvvvv_Ty callee_vvvvv;
 void caller_vlvlivfvdviv();
-typedef void(callee_vlvlivfvdviv_Ty)(v4f32, int64_t, v4f32, int64_t, int, v4f32,
+typedef void(callee_vlvlivfvdviv_Ty)(v4f32, int64, v4f32, int64, int, v4f32,
                                     float, v4f32, double, v4f32, int, v4f32);
 callee_vlvlivfvdviv_Ty callee_vlvlivfvdviv;
--- a/crosstest/test_calling_conv_main.cpp
+++ b/crosstest/test_calling_conv_main.cpp
@@ -162,7 +162,11 @@ void testCallee(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  }
 }
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
 int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;

--- a/crosstest/test_cast.cpp
+++ b/crosstest/test_cast.cpp
@@ -16,6 +16,7 @@
 #include <stdint.h>
 #include "test_cast.h"
+#include "xdefs.h"
 template <typename FromType, typename ToType>
 ToType __attribute__((noinline)) cast(FromType a) {
@@ -38,8 +39,8 @@ template <typename ToType> class Caster {
  static ToType f(uint16_t a) { return cast<uint16_t, ToType>(a); }
  static ToType f(int32_t a) { return cast<int32_t, ToType>(a); }
  static ToType f(uint32_t a) { return cast<uint32_t, ToType>(a); }
-  static ToType f(int64_t a) { return cast<int64_t, ToType>(a); }
+  static ToType f(int64 a) { return cast<int64, ToType>(a); }
-  static ToType f(uint64_t a) { return cast<uint64_t, ToType>(a); }
+  static ToType f(uint64 a) { return cast<uint64, ToType>(a); }
  static ToType f(float a) { return cast<float, ToType>(a); }
  static ToType f(double a) { return cast<double, ToType>(a); }
 };
@@ -56,8 +57,8 @@ template class Caster<int16_t>;
 template class Caster<uint16_t>;
 template class Caster<int32_t>;
 template class Caster<uint32_t>;
-template class Caster<int64_t>;
+template class Caster<int64>;
-template class Caster<uint64_t>;
+template class Caster<uint64>;
 template class Caster<float>;
 template class Caster<double>;
@@ -67,8 +68,8 @@ template class Caster<double>;
 double makeBitCasters() {
  double Result = 0;
  Result += castBits<uint32_t, float>(0);
-  Result += castBits<uint64_t, double>(0);
+  Result += castBits<uint64, double>(0);
  Result += castBits<float, uint32_t>(0);
-  Result += castBits<double, uint64_t>(0);
+  Result += castBits<double, uint64>(0);
  return Result;
 }
--- a/crosstest/test_cast_main.cpp
+++ b/crosstest/test_cast_main.cpp
@@ -22,6 +22,7 @@
 #include "test_arith.def"
 #include "vectors.h"
+#include "xdefs.h"
 // Include test_cast.h twice - once normally, and once within the
 // Subzero_ namespace, corresponding to the llc and Subzero translated
@@ -82,8 +83,8 @@ void testValue(FromType Val, size_t &TotalTests, size_t &Passes,
  COMPARE(cast, FromType, int16_t, Val, FromTypeString);
  COMPARE(cast, FromType, uint32_t, Val, FromTypeString);
  COMPARE(cast, FromType, int32_t, Val, FromTypeString);
-  COMPARE(cast, FromType, uint64_t, Val, FromTypeString);
+  COMPARE(cast, FromType, uint64, Val, FromTypeString);
-  COMPARE(cast, FromType, int64_t, Val, FromTypeString);
+  COMPARE(cast, FromType, int64, Val, FromTypeString);
  COMPARE(cast, FromType, float, Val, FromTypeString);
  COMPARE(cast, FromType, double, Val, FromTypeString);
 }
@@ -110,7 +111,11 @@ void testVector(size_t &TotalTests, size_t &Passes, size_t &Failures,
  }
 }
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;
@@ -147,7 +152,7 @@ int main(int argc, char **argv) {
                                0x80000000, 0x80000001, 0xfffffffe, 0xffffffff};
  static const size_t NumValsSi32 = sizeof(ValsSi32) / sizeof(*ValsSi32);
-  volatile uint64_t ValsUi64[] = {
+  volatile uint64 ValsUi64[] = {
      0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff, 0x7ffe, 0x7fff, 0x8000, 0x8001,
      0xfffe, 0xffff, 0x7ffffffe, 0x7fffffff, 0x80000000, 0x80000001,
      0xfffffffe, 0xffffffff, 0x100000000ull, 0x100000001ull,
@@ -155,7 +160,7 @@ int main(int argc, char **argv) {
      0x8000000000000001ull, 0xfffffffffffffffeull, 0xffffffffffffffffull};
  static const size_t NumValsUi64 = sizeof(ValsUi64) / sizeof(*ValsUi64);
-  volatile int64_t ValsSi64[] = {
+  volatile int64 ValsSi64[] = {
      0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff, 0x7ffe, 0x7fff, 0x8000, 0x8001,
      0xfffe, 0xffff, 0x7ffffffe, 0x7fffffff, 0x80000000, 0x80000001,
      0xfffffffe, 0xffffffff, 0x100000000ll, 0x100000001ll,
@@ -203,13 +208,13 @@ int main(int argc, char **argv) {
    testValue<int32_t>(Val, TotalTests, Passes, Failures, "int32_t");
  }
  for (size_t i = 0; i < NumValsUi64; ++i) {
-    uint64_t Val = ValsUi64[i];
+    uint64 Val = ValsUi64[i];
-    testValue<uint64_t>(Val, TotalTests, Passes, Failures, "uint64_t");
+    testValue<uint64>(Val, TotalTests, Passes, Failures, "uint64");
-    COMPARE(castBits, uint64_t, double, Val, "uint64_t");
+    COMPARE(castBits, uint64, double, Val, "uint64");
  }
  for (size_t i = 0; i < NumValsSi64; ++i) {
-    int64_t Val = ValsSi64[i];
+    int64 Val = ValsSi64[i];
-    testValue<int64_t>(Val, TotalTests, Passes, Failures, "int64_t");
+    testValue<int64>(Val, TotalTests, Passes, Failures, "int64");
  }
  for (size_t i = 0; i < NumValsF32; ++i) {
    for (unsigned j = 0; j < 2; ++j) {
@@ -226,7 +231,7 @@ int main(int argc, char **argv) {
      if (j > 0)
        Val = -Val;
      testValue<double>(Val, TotalTests, Passes, Failures, "double");
-      COMPARE(castBits, double, uint64_t, Val, "double");
+      COMPARE(castBits, double, uint64, Val, "double");
    }
  }
  testVector<v4ui32, v4f32>(TotalTests, Passes, Failures, "v4ui32", "v4f32");

--- a/crosstest/test_fcmp_main.cpp
+++ b/crosstest/test_fcmp_main.cpp
@@ -159,7 +159,11 @@ void testsVector(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  }
 }
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;

--- a/crosstest/test_icmp.cpp
+++ b/crosstest/test_icmp.cpp
@@ -15,12 +15,13 @@
 #include <stdint.h>
 #include "test_icmp.h"
+#include "xdefs.h"
 #define X(cmp, op)                                                             \
  bool icmp##cmp(uint8_t a, uint8_t b) { return a op b; }                      \
  bool icmp##cmp(uint16_t a, uint16_t b) { return a op b; }                    \
  bool icmp##cmp(uint32_t a, uint32_t b) { return a op b; }                    \
-  bool icmp##cmp(uint64_t a, uint64_t b) { return a op b; }                    \
+  bool icmp##cmp(uint64 a, uint64 b) { return a op b; }                        \
  v4ui32 icmp##cmp(v4ui32 a, v4ui32 b) { return a op b; }                      \
  v8ui16 icmp##cmp(v8ui16 a, v8ui16 b) { return a op b; }                      \
  v16ui8 icmp##cmp(v16ui8 a, v16ui8 b) { return a op b; }
@@ -31,7 +32,7 @@ ICMP_U_TABLE
  bool icmp##cmp(myint8_t a, myint8_t b) { return a op b; }                    \
  bool icmp##cmp(int16_t a, int16_t b) { return a op b; }                      \
  bool icmp##cmp(int32_t a, int32_t b) { return a op b; }                      \
-  bool icmp##cmp(int64_t a, int64_t b) { return a op b; }                      \
+  bool icmp##cmp(int64 a, int64 b) { return a op b; }                          \
  v4si32 icmp##cmp(v4si32 a, v4si32 b) { return a op b; }                      \
  v8si16 icmp##cmp(v8si16 a, v8si16 b) { return a op b; }                      \
  v16si8 icmp##cmp(v16si8 a, v16si8 b) { return a op b; }

--- a/crosstest/test_icmp.h
+++ b/crosstest/test_icmp.h
@@ -15,12 +15,13 @@
 #include "test_icmp.def"
 #include "vectors.h"
+#include "xdefs.h"
 #define X(cmp, op)                                                             \
  bool icmp##cmp(uint8_t a, uint8_t b);                                        \
  bool icmp##cmp(uint16_t a, uint16_t b);                                      \
  bool icmp##cmp(uint32_t a, uint32_t b);                                      \
-  bool icmp##cmp(uint64_t a, uint64_t b);                                      \
+  bool icmp##cmp(uint64 a, uint64 b);                                          \
  v4ui32 icmp##cmp(v4ui32 a, v4ui32 b);                                        \
  v8ui16 icmp##cmp(v8ui16 a, v8ui16 b);                                        \
  v16ui8 icmp##cmp(v16ui8 a, v16ui8 b);
@@ -31,7 +32,7 @@ ICMP_U_TABLE
  bool icmp##cmp(myint8_t a, myint8_t b);                                      \
  bool icmp##cmp(int16_t a, int16_t b);                                        \
  bool icmp##cmp(int32_t a, int32_t b);                                        \
-  bool icmp##cmp(int64_t a, int64_t b);                                        \
+  bool icmp##cmp(int64 a, int64 b);                                            \
  v4si32 icmp##cmp(v4si32 a, v4si32 b);                                        \
  v8si16 icmp##cmp(v8si16 a, v8si16 b);                                        \
  v16si8 icmp##cmp(v16si8 a, v16si8 b);

--- a/crosstest/test_icmp_main.cpp
+++ b/crosstest/test_icmp_main.cpp
@@ -23,10 +23,13 @@
 // Subzero_ namespace, corresponding to the llc and Subzero translated
 // object files, respectively.
 #include "test_icmp.h"
 namespace Subzero_ {
 #include "test_icmp.h"
 }
+#include "xdefs.h"
 volatile unsigned Values[] = {
    0x0,        0x1,        0x7ffffffe, 0x7fffffff, 0x80000000, 0x80000001,
    0xfffffffe, 0xffffffff, 0x7e,       0x7f,       0x80,       0x81,
@@ -265,7 +268,11 @@ void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  }
 }
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;
@@ -273,7 +280,7 @@ int main(int argc, char **argv) {
  testsInt<uint8_t, myint8_t>(TotalTests, Passes, Failures);
  testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
  testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
-  testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures);
+  testsInt<uint64, int64>(TotalTests, Passes, Failures);
  testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures);
  testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures);
  testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures);

--- a/crosstest/test_select_main.cpp
+++ b/crosstest/test_select_main.cpp
@@ -130,7 +130,11 @@ void testSelectI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  }
 }
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
 int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;

--- a/crosstest/test_stacksave_main.c
+++ b/crosstest/test_stacksave_main.c
@@ -22,7 +22,11 @@
 DECLARE_TESTS()
 DECLARE_TESTS(Subzero_)
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;

--- a/crosstest/test_strengthreduce_main.cpp
+++ b/crosstest/test_strengthreduce_main.cpp
@@ -25,7 +25,11 @@ namespace Subzero_ {
 #include "test_strengthreduce.h"
 }
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;

--- a/crosstest/test_sync_atomic.def
+++ b/crosstest/test_sync_atomic.def
@@ -14,6 +14,8 @@
 #ifndef TEST_SYNC_ATOMIC_DEF
 #define TEST_SYNC_ATOMIC_DEF
+#include "xdefs.h"
 #define STR(s) #s
 #define RMWOP_TABLE  \
@@ -30,14 +32,14 @@
  X(uint8_t)              \
  X(uint16_t)             \
  X(uint32_t)             \
-  X(uint64_t)
+  X(uint64)
 //#define X(type)
 #define FOR_ALL_RMWTYPES_INST(F, inst) \
  F(inst, uint8_t)                     \
  F(inst, uint16_t)                    \
  F(inst, uint32_t)                    \
-  F(inst, uint64_t)
+  F(inst, uint64)
 #define FOR_ALL_RMWOP_TYPES(X)      \
  FOR_ALL_RMWTYPES_INST(X, add)     \

--- a/crosstest/test_sync_atomic_main.cpp
+++ b/crosstest/test_sync_atomic_main.cpp
@@ -28,11 +28,12 @@
 // Subzero_ namespace, corresponding to the llc and Subzero translated
 // object files, respectively.
 #include "test_sync_atomic.h"
+#include "xdefs.h"
 namespace Subzero_ {
 #include "test_sync_atomic.h"
 }
-volatile uint64_t Values[] = {
+volatile uint64 Values[] = {
    0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff, 0x7ffe, 0x7fff, 0x8000, 0x8001,
    0xfffe, 0xffff, 0x007fffff /*Max subnormal + */, 0x00800000 /*Min+ */,
    0x7f7fffff /*Max+ */, 0x7f800000 /*+Inf*/, 0xff800000 /*-Inf*/,
@@ -51,7 +52,7 @@ struct {
  volatile uint8_t l8;
  volatile uint16_t l16;
  volatile uint32_t l32;
-  volatile uint64_t l64;
+  volatile uint64 l64;
 } AtomicLocs;
 template <typename Type>
@@ -91,12 +92,12 @@ void testAtomicRMW(volatile Type *AtomicLoc, size_t &TotalTests, size_t &Passes,
          } else {
            ++Failures;
            std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
-                      << "(" << static_cast<uint64_t>(Value1) << ", "
+                      << "(" << static_cast<uint64>(Value1) << ", "
-                      << static_cast<uint64_t>(Value2)
+                      << static_cast<uint64>(Value2)
-                      << "): sz1=" << static_cast<uint64_t>(ResultSz1)
+                      << "): sz1=" << static_cast<uint64>(ResultSz1)
-                      << " llc1=" << static_cast<uint64_t>(ResultLlc1)
+                      << " llc1=" << static_cast<uint64>(ResultLlc1)
-                      << " sz2=" << static_cast<uint64_t>(ResultSz2)
+                      << " sz2=" << static_cast<uint64>(ResultSz2)
-                      << " llc2=" << static_cast<uint64_t>(ResultLlc2) << "\n";
+                      << " llc2=" << static_cast<uint64>(ResultLlc2) << "\n";
          }
        }
      }
@@ -137,12 +138,12 @@ void testValCompareAndSwap(volatile Type *AtomicLoc, size_t &TotalTests,
          } else {
            ++Failures;
            std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
-                      << "(" << static_cast<uint64_t>(Value1) << ", "
+                      << "(" << static_cast<uint64>(Value1) << ", "
-                      << static_cast<uint64_t>(Value2)
+                      << static_cast<uint64>(Value2)
-                      << "): sz1=" << static_cast<uint64_t>(ResultSz1)
+                      << "): sz1=" << static_cast<uint64>(ResultSz1)
-                      << " llc1=" << static_cast<uint64_t>(ResultLlc1)
+                      << " llc1=" << static_cast<uint64>(ResultLlc1)
-                      << " sz2=" << static_cast<uint64_t>(ResultSz2)
+                      << " sz2=" << static_cast<uint64>(ResultSz2)
-                      << " llc2=" << static_cast<uint64_t>(ResultLlc2) << "\n";
+                      << " llc2=" << static_cast<uint64>(ResultLlc2) << "\n";
          }
        }
      }
@@ -166,6 +167,22 @@ template <typename Type> void *threadWrapper(void *Data) {
  return NULL;
 }
+// AllocStackForThread prepares the stack attribute for thread number m.
+// Without X8664_STACK_HACK it is a no-op and attr is left untouched. With the
+// hack enabled it obtains a stack from xAllocStack and installs it in attr,
+// aborting the test if the attribute cannot be set.
+#ifndef X8664_STACK_HACK
+void AllocStackForThread(uint32, pthread_attr_t *) {}
+#else  // defined(X8664_STACK_HACK)
+void AllocStackForThread(uint32 m, pthread_attr_t *attr) {
+  static const uint32_t ThreadStackBase = 0x60000000;
+  static const uint32_t ThreadStackSize = 4 << 20; // 4MB.
+  // Each thread's stack ends 2 * ThreadStackSize below the previous one,
+  // which leaves a gap between consecutive stacks.
+  // pthread_attr_setstack reports failure through its return value and does
+  // not set errno, so the returned error number is what gets printed.
+  const int Err = pthread_attr_setstack(
+      attr, xAllocStack(ThreadStackBase - 2 * m * ThreadStackSize,
+                        ThreadStackSize),
+      ThreadStackSize);
+  if (Err != 0) {
+    std::cout << "pthread_attr_setstack: " << strerror(Err) << "\n";
+    abort();
+  }
+}
+#endif // X8664_STACK_HACK
 template <typename Type>
 void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
                          size_t &Passes, size_t &Failures) {
@@ -184,7 +201,7 @@ void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
  // Just test a few values, otherwise it takes a *really* long time.
-  volatile uint64_t ValuesSubset[] = {1, 0x7e, 0x000fffffffffffffffll};
+  volatile uint64 ValuesSubset[] = {1, 0x7e, 0x000fffffffffffffffll};
  const size_t NumValuesSubset = sizeof(ValuesSubset) / sizeof(*ValuesSubset);
  for (size_t f = 0; f < NumFuncs; ++f) {
@@ -200,12 +217,18 @@ void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
        ++TotalTests;
        const size_t NumThreads = 4;
        pthread_t t[NumThreads];
+        pthread_attr_t attr[NumThreads];
        // Try N threads w/ just Llc.
        *AtomicLoc = Value1;
        for (size_t m = 0; m < NumThreads; ++m) {
-          pthread_create(&t[m], NULL, &threadWrapper<Type>,
+          pthread_attr_init(&attr[m]);
-                         reinterpret_cast<void *>(&TDataLlc));
+          AllocStackForThread(m, &attr[m]);
+          if (pthread_create(&t[m], &attr[m], &threadWrapper<Type>,
+                             reinterpret_cast<void *>(&TDataLlc)) != 0) {
+            std::cout << "pthread_create failed w/ " << strerror(errno) << "\n";
+            abort();
+          }
        }
        for (size_t m = 0; m < NumThreads; ++m) {
          pthread_join(t[m], NULL);
@@ -215,7 +238,9 @@ void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
        // Try N threads w/ both Sz and Llc.
        *AtomicLoc = Value1;
        for (size_t m = 0; m < NumThreads; ++m) {
-          if (pthread_create(&t[m], NULL, &threadWrapper<Type>,
+          pthread_attr_init(&attr[m]);
+          AllocStackForThread(m, &attr[m]);
+          if (pthread_create(&t[m], &attr[m], &threadWrapper<Type>,
                             m % 2 == 0
                                 ? reinterpret_cast<void *>(&TDataLlc)
                                 : reinterpret_cast<void *>(&TDataSz)) != 0) {
@@ -238,18 +263,21 @@ void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
        } else {
          ++Failures;
          std::cout << "test_with_threads_" << Funcs[f].Name
-                    << (8 * sizeof(Type)) << "("
+                    << (8 * sizeof(Type)) << "(" << static_cast<uint64>(Value1)
-                    << static_cast<uint64_t>(Value1) << ", "
+                    << ", " << static_cast<uint64>(Value2)
-                    << static_cast<uint64_t>(Value2)
+                    << "): llc=" << static_cast<uint64>(ResultLlc)
-                    << "): llc=" << static_cast<uint64_t>(ResultLlc)
+                    << " mixed=" << static_cast<uint64>(ResultMixed) << "\n";
-                    << " mixed=" << static_cast<uint64_t>(ResultMixed) << "\n";
        }
      }
    }
  }
 }
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;
@@ -257,18 +285,17 @@ int main(int argc, char **argv) {
  testAtomicRMW<uint8_t>(&AtomicLocs.l8, TotalTests, Passes, Failures);
  testAtomicRMW<uint16_t>(&AtomicLocs.l16, TotalTests, Passes, Failures);
  testAtomicRMW<uint32_t>(&AtomicLocs.l32, TotalTests, Passes, Failures);
-  testAtomicRMW<uint64_t>(&AtomicLocs.l64, TotalTests, Passes, Failures);
+  testAtomicRMW<uint64>(&AtomicLocs.l64, TotalTests, Passes, Failures);
  testValCompareAndSwap<uint8_t>(&AtomicLocs.l8, TotalTests, Passes, Failures);
  testValCompareAndSwap<uint16_t>(&AtomicLocs.l16, TotalTests, Passes,
                                  Failures);
  testValCompareAndSwap<uint32_t>(&AtomicLocs.l32, TotalTests, Passes,
                                  Failures);
-  testValCompareAndSwap<uint64_t>(&AtomicLocs.l64, TotalTests, Passes,
+  testValCompareAndSwap<uint64>(&AtomicLocs.l64, TotalTests, Passes, Failures);
-                                  Failures);
  testAtomicRMWThreads<uint8_t>(&AtomicLocs.l8, TotalTests, Passes, Failures);
  testAtomicRMWThreads<uint16_t>(&AtomicLocs.l16, TotalTests, Passes, Failures);
  testAtomicRMWThreads<uint32_t>(&AtomicLocs.l32, TotalTests, Passes, Failures);
-  testAtomicRMWThreads<uint64_t>(&AtomicLocs.l64, TotalTests, Passes, Failures);
+  testAtomicRMWThreads<uint64>(&AtomicLocs.l64, TotalTests, Passes, Failures);
  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
            << " Failures=" << Failures << "\n";

--- a/crosstest/test_vector_ops_main.cpp
+++ b/crosstest/test_vector_ops_main.cpp
@@ -130,7 +130,11 @@ void testExtractElement(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  free(TestVectors);
 }
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
 int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;

--- a/crosstest/xdefs.h
+++ b/crosstest/xdefs.h
+//===- subzero/crosstest/xdefs.h - Definitions for the crosstests. --------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the int64 and uint64 types to avoid link-time errors when compiling
+// the crosstests in LP64.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SUBZERO_CROSSTEST_XDEFS_H_
+#define SUBZERO_CROSSTEST_XDEFS_H_
+// Integer aliases shared by the crosstests. The 64-bit types are spelled with
+// long long (see the file header: this avoids link-time errors in LP64).
+// int32 is signed; it was previously declared as unsigned int by mistake.
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+// SizeT is an unsigned int that the crosstests use in place of size_t.
+typedef unsigned int SizeT;
+#ifdef X8664_STACK_HACK
+// X8664_STACK_HACK is an intrusive way of getting the crosstests to run in
+// x86_64 LP64 even with an ILP32 model. This hack allocates a new stack for
+// running the tests in the low 4GB of the address space.
+#ifdef __cplusplus
+#define XTEST_EXTERN extern "C"
+#else // !defined(__cplusplus)
+#define XTEST_EXTERN extern
+#endif // __cplusplus
+/// xAllocStack allocates the memory chunk [StackEnd - Size - 1, StackEnd). It
+/// requires StackEnd to fit in 32 bits. Conversely, xDeallocStack frees that
+/// memory chunk.
+/// {@
+XTEST_EXTERN unsigned char *xAllocStack(uint64 StackEnd, uint32 Size);
+XTEST_EXTERN void xDeallocStack(uint64 StackEnd, uint32 Size);
+/// @}
+// wrapped_main is invoked by the x86-64 stack hack main. We declare a prototype
+// so the compiler (and not the linker) can yell if a test's wrapped_main
+// prototype does not match what we want.
+XTEST_EXTERN int wrapped_main(int argc, char *argv[]);
+#undef XTEST_EXTERN
+#endif // X8664_STACK_HACK
+#endif // SUBZERO_CROSSTEST_XDEFS_H_
--- a/pydir/build-runtime.py
+++ b/pydir/build-runtime.py
@@ -124,6 +124,8 @@ def main():
        MakeRuntimesForTarget(targets.X8632Target, ll_files,
                              srcdir, tempdir, rtdir, args.verbose)
+        MakeRuntimesForTarget(targets.X8664Target, ll_files,
+                              srcdir, tempdir, rtdir, args.verbose)
        MakeRuntimesForTarget(targets.ARM32Target, ll_files,
                              srcdir, tempdir, rtdir, args.verbose)

--- a/pydir/crosstest.py
+++ b/pydir/crosstest.py
@@ -177,6 +177,18 @@ def main():
            'szrt_{sb}_' + args.target + '.o'
            ).format(root=nacl_root, sb='sb' if args.sandbox else 'native'))
    pure_c = os.path.splitext(args.driver)[1] == '.c'
+    # TargetX8664 is ILP32, but clang does not currently support such a
+    # configuration. In order to run the crosstests we play nasty, dangerous
+    # tricks with the stack pointer.
+    needs_stack_hack = (args.target == 'x8664')
+    stack_hack_params = []
+    if needs_stack_hack:
+      shellcmd('{bin}/clang -g -o stack_hack.x8664.{key}.o -c '
+               'stack_hack.x8664.c'.format(bin=bindir, key=key))
+      stack_hack_params.append('-DX8664_STACK_HACK')
+      stack_hack_params.append('stack_hack.x8664.{key}.o'.format(key=key))
    # Set compiler to clang, clang++, pnacl-clang, or pnacl-clang++.
    compiler = '{bin}/{prefix}{cc}'.format(
        bin=bindir, prefix='pnacl-' if args.sandbox else '',
@@ -189,7 +201,7 @@ def main():
                       '-lm', '-lpthread',
                       '-Wl,--defsym=__Sz_AbsoluteZero=0'] +
                      target_info.cross_headers)
-    shellcmd([compiler, args.driver] + objs +
+    shellcmd([compiler] + stack_hack_params + [args.driver] + objs +
             ['-o', os.path.join(args.dir, args.output)] + sb_native_args)
 if __name__ == '__main__':

--- a/pydir/crosstest_generator.py
+++ b/pydir/crosstest_generator.py
@@ -55,15 +55,17 @@ def main():
  root = FindBaseNaCl()
  # The rest of the attribute sets.
-  targets = [ 'x8632', 'arm32' ]
+  targets = [ 'x8632', 'x8664', 'arm32' ]
  sandboxing = [ 'native', 'sandbox' ]
  opt_levels = [ 'Om1', 'O2' ]
  arch_attrs = { 'x8632': [ 'sse2', 'sse4.1' ],
+                 'x8664': [ 'sse2', 'sse4.1' ],
                 'arm32': [ 'neon', 'hwdiv-arm' ] }
  flat_attrs = []
  for v in arch_attrs.values():
    flat_attrs += v
  arch_flags = { 'x8632': [],
+                 'x8664': [],
                 # ARM doesn't have an integrated assembler yet.
                 'arm32': ['--filetype=asm'] }
  # all_keys is only used in the help text.

--- a/pydir/targets.py
+++ b/pydir/targets.py
@@ -40,6 +40,5 @@ ARM32Target = TargetInfo(target='arm32',
                         ld_emu='armelf_nacl',
                         cross_headers=['-isystem', FindARMCrossInclude()])
 def ConvertTripleToNaCl(nonsfi_triple):
  return nonsfi_triple.replace('linux', 'nacl')
--- a/runtime/szrt_profiler.c
+++ b/runtime/szrt_profiler.c
+#include <inttypes.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -53,7 +54,7 @@ void __Sz_profile_summary() {
  printf("%s", SubzeroLogo);
  for (const struct BlockProfileInfo **curr = &__Sz_block_profile_info;
       *curr != NULL; ++curr) {
-    printf("%lld\t%s\n", (*curr)->Counter, (*curr)->BlockName);
+    printf("%" PRIu64 "\t%s\n", (*curr)->Counter, (*curr)->BlockName);
  }
  fflush(stdout);
 }
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -243,9 +243,9 @@ public:
  // Cross Xmm/GPR cast instructions.
  template <typename DReg_t, typename SReg_t> struct CastEmitterRegOp {
-    typedef void (AssemblerX86Base::*TypedEmitRegs)(Type, DReg_t, SReg_t);
+    typedef void (AssemblerX86Base::*TypedEmitRegs)(Type, DReg_t, Type, SReg_t);
    typedef void (AssemblerX86Base::*TypedEmitAddr)(
-        Type, DReg_t, const typename Traits::Address &);
+        Type, DReg_t, Type, const typename Traits::Address &);
    TypedEmitRegs RegReg;
    TypedEmitAddr RegAddr;
@@ -299,7 +299,14 @@ public:
           typename Traits::GPRRegister src);
  void mov(Type Ty, const typename Traits::Address &dst, const Immediate &imm);
-  void movFromAh(const typename Traits::GPRRegister dst);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type
+  movabs(const typename Traits::GPRRegister Dst, uint64_t Imm64);
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  movabs(const typename Traits::GPRRegister, uint64_t) {
+    llvm::report_fatal_error("movabs is only supported in 64-bit x86 targets.");
+  }
  void movzx(Type Ty, typename Traits::GPRRegister dst,
             typename Traits::GPRRegister src);
@@ -328,11 +335,13 @@ public:
  void movss(Type Ty, typename Traits::XmmRegister dst,
             typename Traits::XmmRegister src);
-  void movd(typename Traits::XmmRegister dst, typename Traits::GPRRegister src);
+  void movd(Type SrcTy, typename Traits::XmmRegister dst,
-  void movd(typename Traits::XmmRegister dst,
+            typename Traits::GPRRegister src);
+  void movd(Type SrcTy, typename Traits::XmmRegister dst,
            const typename Traits::Address &src);
-  void movd(typename Traits::GPRRegister dst, typename Traits::XmmRegister src);
+  void movd(Type DestTy, typename Traits::GPRRegister dst,
-  void movd(const typename Traits::Address &dst,
+            typename Traits::XmmRegister src);
+  void movd(Type DestTy, const typename Traits::Address &dst,
            typename Traits::XmmRegister src);
  void movq(typename Traits::XmmRegister dst, typename Traits::XmmRegister src);
@@ -504,9 +513,9 @@ public:
  void cvttps2dq(Type, typename Traits::XmmRegister dst,
                 const typename Traits::Address &src);
-  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst,
+  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst, Type SrcTy,
                typename Traits::GPRRegister src);
-  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst,
+  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst, Type SrcTy,
                const typename Traits::Address &src);
  void cvtfloat2float(Type SrcTy, typename Traits::XmmRegister dst,
@@ -514,9 +523,9 @@ public:
  void cvtfloat2float(Type SrcTy, typename Traits::XmmRegister dst,
                      const typename Traits::Address &src);
-  void cvttss2si(Type SrcTy, typename Traits::GPRRegister dst,
+  void cvttss2si(Type DestTy, typename Traits::GPRRegister dst, Type SrcTy,
                 typename Traits::XmmRegister src);
-  void cvttss2si(Type SrcTy, typename Traits::GPRRegister dst,
+  void cvttss2si(Type DestTy, typename Traits::GPRRegister dst, Type SrcTy,
                 const typename Traits::Address &src);
  void ucomiss(Type Ty, typename Traits::XmmRegister a,
@@ -719,6 +728,12 @@ public:
  void cbw();
  void cwd();
  void cdq();
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type cqo();
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type cqo() {
+    llvm::report_fatal_error("CQO is only available in 64-bit x86 backends.");
+  }
  void div(Type Ty, typename Traits::GPRRegister reg);
  void div(Type Ty, const typename Traits::Address &address);
@@ -936,7 +951,7 @@ private:
                     typename Traits::GPRRegister>::value;
    return IsGPR && (Reg & 0x04) != 0 && (Reg & 0x08) == 0 &&
-           isByteSizedArithType(Ty);
+           isByteSizedType(Ty);
  };
  // assembleAndEmitRex is used for determining which (if any) rex prefix should

--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
--- a/src/IceELFSection.h
+++ b/src/IceELFSection.h
@@ -362,8 +362,7 @@ void ELFRelocationSection::writeData(const GlobalContext &Ctx, ELFStreamer &Str,
      llvm::report_fatal_error("Missing symbol mentioned in reloc");
    if (IsELF64) {
-      llvm_unreachable(
+      // TODO(jpp): check that Fixup.offset() is correct even for pc-rel.
-          "Not tested -- check that Fixup.offset() is correct even for pc-rel");
      Elf64_Rela Rela;
      Rela.r_offset = Fixup.position();
      Rela.setSymbolAndType(Symbol->getNumber(), Fixup.kind());

--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -206,7 +206,7 @@ MachineTraits<TargetX8632>::X86OperandMem::toAsmAddress(
    } else if (const auto CR =
                   llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
      Disp = CR->getOffset();
-      Fixup = Asm->createFixup(llvm::ELF::R_386_32, CR);
+      Fixup = Asm->createFixup(RelFixup, CR);
    } else {
      llvm_unreachable("Unexpected offset type");
    }

--- a/src/IceInstX8664.cpp
+++ b/src/IceInstX8664.cpp
@@ -179,8 +179,8 @@ MachineTraits<TargetX8664>::X86OperandMem::toAsmAddress(
      Disp = static_cast<int32_t>(CI->getValue());
    } else if (const auto CR =
                   llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
-      Disp = CR->getOffset();
+      Disp = CR->getOffset() - 4;
-      Fixup = Asm->createFixup(llvm::ELF::R_386_32, CR);
+      Fixup = Asm->createFixup(PcRelFixup, CR);
    } else {
      llvm_unreachable("Unexpected offset type");
    }

--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -1100,6 +1100,8 @@ class InstX86Movsx
    : public InstX86BaseUnaryopGPR<Machine, InstX86Base<Machine>::Movsx> {
 public:
  static InstX86Movsx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType())); // Dest must be wider than Src.
    return new (Func->allocate<InstX86Movsx>()) InstX86Movsx(Func, Dest, Src);
  }
@@ -1116,6 +1118,8 @@ class InstX86Movzx
    : public InstX86BaseUnaryopGPR<Machine, InstX86Base<Machine>::Movzx> {
 public:
  static InstX86Movzx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType())); // Dest must be wider than Src.
    return new (Func->allocate<InstX86Movzx>()) InstX86Movzx(Func, Dest, Src);
  }

--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -792,7 +792,7 @@ void TargetDataX8632::lowerJumpTables() {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    for (const JumpTableData &JT : Ctx->getJumpTables())
-      Writer->writeJumpTable(JT, llvm::ELF::R_386_32);
+      Writer->writeJumpTable(JT, TargetX8632::Traits::RelFixup);
  } break;
  case FT_Asm:
    // Already emitted from Cfg
@@ -821,7 +821,8 @@ void TargetDataX8632::lowerGlobals(const VariableDeclarationList &Vars,
  switch (Ctx->getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, llvm::ELF::R_386_32, SectionSuffix);
+    Writer->writeDataSection(Vars, TargetX8632::Traits::RelFixup,
+                             SectionSuffix);
  } break;
  case FT_Asm:
  case FT_Iasm: {

--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -68,6 +68,7 @@ template <> struct MachineTraits<TargetX8632> {
  static const GPRRegister Encoded_Reg_Accumulator = RegX8632::Encoded_Reg_eax;
  static const GPRRegister Encoded_Reg_Counter = RegX8632::Encoded_Reg_ecx;
  static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32;
+  static const FixupKind RelFixup = llvm::ELF::R_386_32;
  class Operand {
  public:
@@ -272,6 +273,7 @@ template <> struct MachineTraits<TargetX8632> {
  };
  static const char *TargetName;
+  static constexpr Type WordType = IceType_i32;
  static IceString getRegName(SizeT RegNum, Type Ty) {
    assert(RegNum < RegisterSet::Reg_NUM);

--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -123,7 +123,7 @@ getRegisterForGprArgNum(uint32_t ArgNum) {
 }
 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
-// OperandList in lowerCall. std::max() was supposed to work, but it doesn't.
+// OperandList in lowerCall. std::max() is supposed to work, but it doesn't.
 constexpr SizeT constexprMax(SizeT S0, SizeT S1) { return S0 < S1 ? S1 : S0; }
 } // end of anonymous namespace
@@ -239,7 +239,6 @@ void TargetX8664::lowerCall(const InstCall *Instr) {
  Variable *Dest = Instr->getDest();
  // ReturnReg doubles as ReturnRegLo as necessary.
  Variable *ReturnReg = nullptr;
-  Variable *ReturnRegHi = nullptr;
  if (Dest) {
    switch (Dest->getType()) {
    case IceType_NUM:
@@ -250,12 +249,8 @@ void TargetX8664::lowerCall(const InstCall *Instr) {
    case IceType_i8:
    case IceType_i16:
    case IceType_i32:
-      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
-      break;
    case IceType_i64:
-      // TODO(jpp): return i64 in a GPR.
+      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
-      ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
      break;
    case IceType_f32:
    case IceType_f64:
@@ -271,27 +266,16 @@ void TargetX8664::lowerCall(const InstCall *Instr) {
    }
  }
-  Operand *CallTarget = legalize(Instr->getCallTarget());
+  Operand *CallTarget = legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm);
  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
  if (NeedSandboxing) {
-    if (llvm::isa<Constant>(CallTarget)) {
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-    } else {
-      Variable *CallTargetVar = nullptr;
-      _mov(CallTargetVar, CallTarget);
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-      const SizeT BundleSize =
-          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
-      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
-      CallTarget = CallTargetVar;
-    }
  }
  Inst *NewCall = Traits::Insts::Call::create(Func, ReturnReg, CallTarget);
  Context.insert(NewCall);
-  if (NeedSandboxing)
+  if (NeedSandboxing) {
-    _bundle_unlock();
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
-  if (ReturnRegHi)
+  }
-    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
  // Add the appropriate offset to esp.  The call instruction takes care
  // of resetting the stack offset during emission.
@@ -315,25 +299,11 @@ void TargetX8664::lowerCall(const InstCall *Instr) {
  assert(ReturnReg && "x86-64 always returns value on registers.");
-  // Assign the result of the call to Dest.
+  if (isVectorType(Dest->getType())) {
-  if (ReturnRegHi) {
-    assert(Dest->getType() == IceType_i64);
-    split64(Dest);
-    Variable *DestLo = Dest->getLo();
-    Variable *DestHi = Dest->getHi();
-    _mov(DestLo, ReturnReg);
-    _mov(DestHi, ReturnRegHi);
-    return;
-  }
-  assert(Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64 ||
-         Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
-         Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
-         isVectorType(Dest->getType()));
-  if (isScalarFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
    _movp(Dest, ReturnReg);
  } else {
+    assert(isScalarFloatingType(Dest->getType()) ||
+           isScalarIntegerType(Dest->getType()));
    _mov(Dest, ReturnReg);
  }
 }
@@ -356,36 +326,36 @@ void TargetX8664::lowerArguments() {
       ++i) {
    Variable *Arg = Args[i];
    Type Ty = Arg->getType();
-    if ((isVectorType(Ty) || isScalarFloatingType(Ty)) &&
+    Variable *RegisterArg = nullptr;
-        NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
+    int32_t RegNum = Variable::NoRegister;
-      // Replace Arg in the argument list with the home register.  Then
+    if ((isVectorType(Ty) || isScalarFloatingType(Ty))) {
-      // generate an instruction in the prolog to copy the home register
+      if (NumXmmArgs >= Traits::X86_MAX_XMM_ARGS) {
-      // to the assigned location of Arg.
+        continue;
-      int32_t RegNum = getRegisterForXmmArgNum(NumXmmArgs);
+      }
+      RegNum = getRegisterForXmmArgNum(NumXmmArgs);
      ++NumXmmArgs;
-      Variable *RegisterArg = Func->makeVariable(Ty);
+      RegisterArg = Func->makeVariable(Ty);
-      if (BuildDefs::dump())
+    } else if (isScalarIntegerType(Ty)) {
-        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
+      if (NumGprArgs >= Traits::X86_MAX_GPR_ARGS) {
-      RegisterArg->setRegNum(RegNum);
+        continue;
-      RegisterArg->setIsArg();
+      }
-      Arg->setIsArg(false);
+      RegNum = getRegisterForGprArgNum(NumGprArgs);
-      Args[i] = RegisterArg;
-      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
-    } else if (isScalarIntegerType(Ty) &&
-               NumGprArgs < Traits::X86_MAX_GPR_ARGS) {
-      int32_t RegNum = getRegisterForGprArgNum(NumGprArgs);
      ++NumGprArgs;
-      Variable *RegisterArg = Func->makeVariable(Ty);
+      RegisterArg = Func->makeVariable(Ty);
-      if (BuildDefs::dump())
-        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
-      RegisterArg->setRegNum(RegNum);
-      RegisterArg->setIsArg();
-      Arg->setIsArg(false);
-      Args[i] = RegisterArg;
-      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
    }
+    assert(RegNum != Variable::NoRegister);
+    assert(RegisterArg != nullptr);
+    // Replace Arg in the argument list with the home register.  Then
+    // generate an instruction in the prolog to copy the home register
+    // to the assigned location of Arg.
+    if (BuildDefs::dump())
+      RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
+    RegisterArg->setRegNum(RegNum);
+    RegisterArg->setIsArg();
+    Arg->setIsArg(false);
+    Args[i] = RegisterArg;
+    Context.insert(InstAssign::create(Func, Arg, RegisterArg));
  }
 }
@@ -393,19 +363,11 @@ void TargetX8664::lowerRet(const InstRet *Inst) {
  Variable *Reg = nullptr;
  if (Inst->hasRetValue()) {
    Operand *Src0 = legalize(Inst->getRetValue());
-    // TODO(jpp): this is not needed.
+    if (isVectorType(Src0->getType()) ||
-    if (Src0->getType() == IceType_i64) {
+        isScalarFloatingType(Src0->getType())) {
-      Variable *eax =
-          legalizeToReg(loOperand(Src0), Traits::RegisterSet::Reg_eax);
-      Variable *edx =
-          legalizeToReg(hiOperand(Src0), Traits::RegisterSet::Reg_edx);
-      Reg = eax;
-      Context.insert(InstFakeUse::create(Func, edx));
-    } else if (isScalarFloatingType(Src0->getType())) {
-      _fld(Src0);
-    } else if (isVectorType(Src0->getType())) {
      Reg = legalizeToReg(Src0, Traits::RegisterSet::Reg_xmm0);
    } else {
+      assert(isScalarIntegerType(Src0->getType()));
      _mov(Reg, Src0, Traits::RegisterSet::Reg_eax);
    }
  }
@@ -577,19 +539,17 @@ void TargetX8664::addProlog(CfgNode *Node) {
  unsigned NumGPRArgs = 0;
  for (Variable *Arg : Args) {
    // Skip arguments passed in registers.
-    if (isVectorType(Arg->getType()) && NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
+    if (isVectorType(Arg->getType()) || isScalarFloatingType(Arg->getType())) {
-      ++NumXmmArgs;
+      if (NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
-      continue;
+        ++NumXmmArgs;
-    }
+        continue;
-    if (isScalarFloatingType(Arg->getType()) &&
+      }
-        NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
+    } else {
-      ++NumXmmArgs;
+      assert(isScalarIntegerType(Arg->getType()));
-      continue;
+      if (NumGPRArgs < Traits::X86_MAX_GPR_ARGS) {
-    }
+        ++NumGPRArgs;
-    if (isScalarIntegerType(Arg->getType()) &&
+        continue;
-        NumGPRArgs < Traits::X86_MAX_GPR_ARGS) {
+      }
-      ++NumGPRArgs;
-      continue;
    }
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
  }
@@ -679,23 +639,9 @@ void TargetX8664::addEpilog(CfgNode *Node) {
    }
  }
-  if (!Ctx->getFlags().getUseSandboxing())
+  if (Ctx->getFlags().getUseSandboxing()) {
-    return;
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
-  // Change the original ret instruction into a sandboxed return sequence.
-  // t:ecx = pop
-  // bundle_lock
-  // and t, ~31
-  // jmp *t
-  // bundle_unlock
-  // FakeUse <original_ret_operand>
-  Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
-  _pop(T_ecx);
-  lowerIndirectJump(T_ecx);
-  if (RI->getSrcSize()) {
-    Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
-    Context.insert(InstFakeUse::create(Func, RetValue));
  }
-  RI->setDeleted();
 }
 void TargetX8664::emitJumpTable(const Cfg *Func,
@@ -858,8 +804,7 @@ void TargetDataX8664::lowerJumpTables() {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    for (const JumpTableData &JumpTable : Ctx->getJumpTables())
-      // TODO(jpp): not 386.
+      Writer->writeJumpTable(JumpTable, TargetX8664::Traits::RelFixup);
-      Writer->writeJumpTable(JumpTable, llvm::ELF::R_386_32);
  } break;
  case FT_Asm:
    // Already emitted from Cfg
@@ -888,8 +833,8 @@ void TargetDataX8664::lowerGlobals(const VariableDeclarationList &Vars,
  switch (Ctx->getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    // TODO(jpp): not 386.
+    Writer->writeDataSection(Vars, TargetX8664::Traits::RelFixup,
-    Writer->writeDataSection(Vars, llvm::ELF::R_386_32, SectionSuffix);
+                             SectionSuffix);
  } break;
  case FT_Asm:
  case FT_Iasm: {

--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -66,7 +66,8 @@ template <> struct MachineTraits<TargetX8664> {
  using RegisterSet = ::Ice::RegX8664;
  static const GPRRegister Encoded_Reg_Accumulator = RegX8664::Encoded_Reg_eax;
  static const GPRRegister Encoded_Reg_Counter = RegX8664::Encoded_Reg_ecx;
-  static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32; // TODO(jpp): ???
+  static const FixupKind PcRelFixup = llvm::ELF::R_X86_64_PC32;
+  static const FixupKind RelFixup = llvm::ELF::R_X86_64_32S;
  class Operand {
  public:
@@ -270,8 +271,8 @@ template <> struct MachineTraits<TargetX8664> {
    static Address ofConstPool(Assembler *Asm, const Constant *Imm) {
      // TODO(jpp): ???
-      AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Imm);
+      AssemblerFixup *Fixup = Asm->createFixup(RelFixup, Imm);
-      const RelocOffsetT Offset = 0;
+      const RelocOffsetT Offset = 4;
      return Address(ABSOLUTE, Offset, Fixup);
    }
  };
@@ -293,6 +294,7 @@ template <> struct MachineTraits<TargetX8664> {
  };
  static const char *TargetName;
+  static constexpr Type WordType = IceType_i64;
  static IceString getRegName(SizeT RegNum, Type Ty) {
    assert(RegNum < RegisterSet::Reg_NUM);
@@ -331,7 +333,7 @@ template <> struct MachineTraits<TargetX8664> {
 #define X(val, encode, name64, name32, name16, name8, scratch, preserved,      \
          stackptr, frameptr, isInt, isFP)                                     \
  (*IntegerRegisters)[RegisterSet::val] = isInt;                               \
-  (*IntegerRegistersI8)[RegisterSet::val] = 1;                                 \
+  (*IntegerRegistersI8)[RegisterSet::val] = isInt;                             \
  (*FloatRegisters)[RegisterSet::val] = isFP;                                  \
  (*VectorRegisters)[RegisterSet::val] = isFP;                                 \
  (*ScratchRegs)[RegisterSet::val] = scratch;
@@ -450,7 +452,7 @@ template <> struct MachineTraits<TargetX8664> {
  /// address.
  static const uint32_t X86_STACK_ALIGNMENT_BYTES;
  /// Size of the return address on the stack
-  static const uint32_t X86_RET_IP_SIZE_BYTES = 4;
+  static const uint32_t X86_RET_IP_SIZE_BYTES = 8;
  /// The number of different NOP instructions
  static const uint32_t X86_NUM_NOP_VARIANTS = 5;

--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -21,6 +21,7 @@
 #include "IceInst.h"
 #include "IceSwitchLowering.h"
 #include "IceTargetLowering.h"
+#include "IceUtils.h"
 #include <type_traits>
 #include <utility>
@@ -80,10 +81,9 @@ public:
                           : Traits::RegisterSet::Reg_esp;
  }
  size_t typeWidthInBytesOnStack(Type Ty) const override {
-    // Round up to the next multiple of 4 bytes.  In particular, i1,
+    // Round up to the next multiple of WordType bytes.
-    // i8, and i16 are rounded up to 4 bytes.
+    const uint32_t WordSizeInBytes = typeWidthInBytes(Traits::WordType);
-    // TODO(jpp): this needs to round to multiples of 8 bytes in x86-64.
+    return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes);
-    return (typeWidthInBytes(Ty) + 3) & ~3;
  }
  SizeT getMinJumpTableSize() const override { return 4; }
@@ -98,14 +98,40 @@ public:
  void emit(const ConstantDouble *C) const final;
  void initNodeForLowering(CfgNode *Node) override;
-  /// Ensure that a 64-bit Variable has been split into 2 32-bit
+  /// x86-32: Ensure that a 64-bit Variable has been split into 2 32-bit
  /// Variables, creating them if necessary.  This is needed for all
  /// I64 operations, and it is needed for pushing F64 arguments for
  /// function calls using the 32-bit push instruction (though the
  /// latter could be done by directly writing to the stack).
-  void split64(Variable *Var);
+  ///
-  Operand *loOperand(Operand *Operand);
+  /// x86-64: Complains loudly if invoked because the cpu can handle
-  Operand *hiOperand(Operand *Operand);
+  /// 64-bit types natively.
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type split64(Variable *Var);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type split64(Variable *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (split64)");
+  }
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  loOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *loOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (loOperand)");
+  }
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  hiOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *hiOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (hiOperand)");
+  }
  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
  typename Traits::Address stackVarToAsmOperand(const Variable *Var) const;
@@ -128,6 +154,19 @@ protected:
  void lowerExtractElement(const InstExtractElement *Inst) override;
  void lowerFcmp(const InstFcmp *Inst) override;
  void lowerIcmp(const InstIcmp *Inst) override;
+  /// x86-64: Complains loudly if invoked because the cpu can handle 64-bit
+  /// types natively.
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type
+  lowerIcmp64(const InstIcmp *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (lowerIcmp64)");
+  }
+  /// x86-32: lowerIcmp64 handles 64-bit icmp lowering.
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  lowerIcmp64(const InstIcmp *Inst);
  void lowerIntrinsicCall(const InstIntrinsicCall *Inst) override;
  void lowerInsertElement(const InstInsertElement *Inst) override;
  void lowerLoad(const InstLoad *Inst) override;

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
--- a/unittest/AssemblerX8632/DataMov.cpp
+++ b/unittest/AssemblerX8632/DataMov.cpp
@@ -538,7 +538,8 @@ TEST_F(AssemblerX8632Test, MovdToXmm) {
                                                                               \
    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src, Immediate(Value));     \
    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));   \
-    __ movd(XmmRegister::Encoded_Reg_##Dst, GPRRegister::Encoded_Reg_##Src);   \
+    __ movd(IceType_i32, XmmRegister::Encoded_Reg_##Dst,                       \
+            GPRRegister::Encoded_Reg_##Src);                                   \
                                                                               \
    AssembledTest test = assemble();                                           \
                                                                               \
@@ -560,7 +561,7 @@ TEST_F(AssemblerX8632Test, MovdToXmm) {
    const uint64_t V1 = 0xFFFFFFFF00000000ull;                                 \
                                                                               \
    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
-    __ movd(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));                 \
+    __ movd(IceType_i32, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));    \
                                                                               \
    AssembledTest test = assemble();                                           \
                                                                               \
@@ -609,7 +610,8 @@ TEST_F(AssemblerX8632Test, MovdFromXmm) {
    const uint32_t V0 = Value;                                                 \
                                                                               \
    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));   \
-    __ movd(GPRRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src);   \
+    __ movd(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                       \
+            XmmRegister::Encoded_Reg_##Src);                                   \
                                                                               \
    AssembledTest test = assemble();                                           \
                                                                               \
@@ -631,7 +633,7 @@ TEST_F(AssemblerX8632Test, MovdFromXmm) {
    const uint32_t V1 = ~(Value);                                              \
                                                                               \
    __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));   \
-    __ movd(dwordAddress(T1), XmmRegister::Encoded_Reg_##Src);                 \
+    __ movd(IceType_i32, dwordAddress(T1), XmmRegister::Encoded_Reg_##Src);    \
                                                                               \
    AssembledTest test = assemble();                                           \
                                                                               \

--- a/unittest/AssemblerX8632/XmmArith.cpp
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -1072,7 +1072,7 @@ TEST_F(AssemblerX8632Test, Cvt) {
    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
           Immediate(Inst##Size##SrcValue));                                   \
-    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,              \
+    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst, IceType_i32, \
                 GPRRegister::Encoded_Reg_##GPR);                              \
                                                                               \
    AssembledTest test = assemble();                                           \
@@ -1092,7 +1092,7 @@ TEST_F(AssemblerX8632Test, Cvt) {
    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
           Immediate(Inst##Size##DstValue));                                   \
    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));               \
-    __ cvt##Inst(IceType_f##Size, GPRRegister::Encoded_Reg_##GPR,              \
+    __ cvt##Inst(IceType_i32, GPRRegister::Encoded_Reg_##GPR, IceType_f##Size, \
                 XmmRegister::Encoded_Reg_##Src);                              \
                                                                               \
    AssembledTest test = assemble();                                           \
@@ -1132,7 +1132,7 @@ TEST_F(AssemblerX8632Test, Cvt) {
    const uint32_t T1 = allocateDword();                                       \
                                                                               \
    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
-    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,              \
+    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst, IceType_i32, \
                 dwordAddress(T1));                                            \
                                                                               \
    AssembledTest test = assemble();                                           \
@@ -1152,7 +1152,7 @@ TEST_F(AssemblerX8632Test, Cvt) {
                                                                               \
    __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
           Immediate(Inst##Size##DstValue));                                   \
-    __ cvt##Inst(IceType_f##Size, GPRRegister::Encoded_Reg_##GPR,              \
+    __ cvt##Inst(IceType_i32, GPRRegister::Encoded_Reg_##GPR, IceType_f##Size, \
                 dwordAddress(T0));                                            \
                                                                               \
    AssembledTest test = assemble();                                           \

--- a/unittest/AssemblerX8664/DataMov.cpp
+++ b/unittest/AssemblerX8664/DataMov.cpp
--- a/unittest/AssemblerX8664/XmmArith.cpp
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -1104,15 +1104,16 @@ TEST_F(AssemblerX8664Test, Cvt) {
    reset();                                                                   \
  } while (0)
-#define TestImplSXmmReg(Dst, GPR, Inst, Size)                                  \
+#define TestImplSXmmReg(Dst, GPR, Inst, Size, IntType)                         \
  do {                                                                         \
    static constexpr char TestString[] =                                       \
-        "(" #Dst ", " #GPR ", cvt" #Inst ", f" #Size ")";                      \
+        "(" #Dst ", " #GPR ", cvt" #Inst ", " #IntType ", f" #Size ")";        \
    const uint32_t T0 = allocateDqword();                                      \
                                                                               \
    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
    __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##SrcValue)); \
-    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), Encoded_GPR_##GPR());   \
+    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), IntType,                \
+                 Encoded_GPR_##GPR());                                         \
                                                                               \
    AssembledTest test = assemble();                                           \
    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
@@ -1122,21 +1123,23 @@ TEST_F(AssemblerX8664Test, Cvt) {
    reset();                                                                   \
  } while (0)
-#define TestImplSRegXmm(GPR, Src, Inst, Size)                                  \
+#define TestImplSRegXmm(GPR, Src, Inst, IntSize, Size)                         \
  do {                                                                         \
    static constexpr char TestString[] =                                       \
-        "(" #GPR ", " #Src ", cvt" #Inst ", f" #Size ")";                      \
+        "(" #GPR ", " #Src ", cvt" #Inst ", " #IntSize ", f" #Size ")";        \
    const uint32_t T0 = allocateDqword();                                      \
                                                                               \
    __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##DstValue)); \
    __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
-    __ cvt##Inst(IceType_f##Size, Encoded_GPR_##GPR(), Encoded_Xmm_##Src());   \
+    __ cvt##Inst(IceType_i##IntSize, Encoded_GPR_##GPR(), IceType_f##Size,     \
+                 Encoded_Xmm_##Src());                                         \
                                                                               \
    AssembledTest test = assemble();                                           \
    test.setDqwordTo(T0, Inst##Size##SrcValue);                                \
    test.run();                                                                \
                                                                               \
-    ASSERT_EQ(static_cast<uint32_t>(Inst##Size##Expected), test.GPR())         \
+    ASSERT_EQ(static_cast<uint##IntSize##_t>(Inst##Size##Expected),            \
+              test.GPR())                                                      \
        << TestString;                                                         \
    reset();                                                                   \
  } while (0)
@@ -1160,15 +1163,16 @@ TEST_F(AssemblerX8664Test, Cvt) {
    reset();                                                                   \
  } while (0)
-#define TestImplSXmmAddr(Dst, Inst, Size)                                      \
+#define TestImplSXmmAddr(Dst, Inst, Size, IntType)                             \
  do {                                                                         \
    static constexpr char TestString[] =                                       \
-        "(" #Dst ", Addr, cvt" #Inst ", f" #Size ")";                          \
+        "(" #Dst ", Addr, cvt" #Inst ", f" #Size ", " #IntType ")";            \
    const uint32_t T0 = allocateDqword();                                      \
    const uint32_t T1 = allocateDword();                                       \
                                                                               \
    __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
-    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), dwordAddress(T1));      \
+    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), IntType,                \
+                 dwordAddress(T1));                                            \
                                                                               \
    AssembledTest test = assemble();                                           \
    test.setDqwordTo(T0, Inst##Size##DstValue);                                \
@@ -1179,20 +1183,22 @@ TEST_F(AssemblerX8664Test, Cvt) {
    reset();                                                                   \
  } while (0)
-#define TestImplSRegAddr(GPR, Inst, Size)                                      \
+#define TestImplSRegAddr(GPR, Inst, IntSize, Size)                             \
  do {                                                                         \
    static constexpr char TestString[] =                                       \
-        "(" #GPR ", Addr, cvt" #Inst ", f" #Size ")";                          \
+        "(" #GPR ", Addr, cvt" #Inst ", f" #Size ", " #IntSize ")";            \
    const uint32_t T0 = allocateDqword();                                      \
                                                                               \
    __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##DstValue)); \
-    __ cvt##Inst(IceType_f##Size, Encoded_GPR_##GPR(), dwordAddress(T0));      \
+    __ cvt##Inst(IceType_i##IntSize, Encoded_GPR_##GPR(), IceType_f##Size,     \
+                 dwordAddress(T0));                                            \
                                                                               \
    AssembledTest test = assemble();                                           \
    test.setDqwordTo(T0, Inst##Size##SrcValue);                                \
    test.run();                                                                \
                                                                               \
-    ASSERT_EQ(static_cast<uint32_t>(Inst##Size##Expected), test.GPR())         \
+    ASSERT_EQ(static_cast<uint##IntSize##_t>(Inst##Size##Expected),            \
+              test.GPR())                                                      \
        << TestString;                                                         \
    reset();                                                                   \
  } while (0)
@@ -1203,10 +1209,14 @@ TEST_F(AssemblerX8664Test, Cvt) {
    TestImplPXmmAddr(Src, dq2ps, Size);                                        \
    TestImplPXmmXmm(Dst, Src, tps2dq, Size);                                   \
    TestImplPXmmAddr(Src, tps2dq, Size);                                       \
-    TestImplSXmmReg(Dst, GPR, si2ss, Size);                                    \
+    TestImplSXmmReg(Dst, GPR, si2ss, Size, IceType_i32);                       \
-    TestImplSXmmAddr(Dst, si2ss, Size);                                        \
+    TestImplSXmmReg(Dst, GPR, si2ss, Size, IceType_i64);                       \
-    TestImplSRegXmm(GPR, Src, tss2si, Size);                                   \
+    TestImplSXmmAddr(Dst, si2ss, Size, IceType_i32);                           \
-    TestImplSRegAddr(GPR, tss2si, Size);                                       \
+    TestImplSXmmAddr(Dst, si2ss, Size, IceType_i64);                           \
+    TestImplSRegXmm(GPR, Src, tss2si, 32, Size);                               \
+    TestImplSRegXmm(GPR, Src, tss2si, 64, Size);                               \
+    TestImplSRegAddr(GPR, tss2si, 32, Size);                                   \
+    TestImplSRegAddr(GPR, tss2si, 64, Size);                                   \
    TestImplPXmmXmm(Dst, Src, float2float, Size);                              \
    TestImplPXmmAddr(Src, float2float, Size);                                  \
  } while (0)