Subzero: Align the stack at the point of function calls.

Be compatible with the x86-32 calling convention by ensuring that the stack is aligned to 16 bytes at the point of the call instruction. Also ensure that vector arguments passed on the stack are 16 byte aligned. Also, make alloca instructions respect alignment. BUG=none R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/444443002

Subzero: Align the stack at the point of function calls.
105b7044 · Matt Wala · 8835b89b · 105b7044 · 105b7044 · 105b7044
Commit 105b7044 authored Aug 11, 2014 by Matt Wala
16 changed files
--- a/crosstest/crosstest.py
+++ b/crosstest/crosstest.py
@@ -134,7 +134,6 @@ if __name__ == '__main__':
            objs.append(bitcode)
    linker = 'clang' if os.path.splitext(args.driver)[1] == '.c' else 'clang++'
-    # TODO: Remove -mstackrealign after Subzero supports stack alignment.
+    shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32', args.driver] +
-    shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32',
+             objs +
-              '-mstackrealign', args.driver] + objs +
             ['-lm', '-lpthread', '-o', os.path.join(args.dir, args.output)])
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -54,6 +54,14 @@ for optlevel in ${OPTLEVELS} ; do
            --output=test_bitmanip_O${optlevel}_${attribute}
        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_calling_conv.cpp \
+            --driver=test_calling_conv_main.cpp \
+            --output=test_calling_conv_O${optlevel}_${attribute}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
            --prefix=Subzero_ \
            --target=x8632 \
            --dir="${OUTDIR}" \
@@ -137,6 +145,7 @@ for optlevel in ${OPTLEVELS} ; do
        "${OUTDIR}"/mem_intrin_O${optlevel}_${attribute}
        "${OUTDIR}"/test_arith_O${optlevel}_${attribute}
        "${OUTDIR}"/test_bitmanip_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_calling_conv_O${optlevel}_${attribute}
        "${OUTDIR}"/test_cast_O${optlevel}_${attribute}
        "${OUTDIR}"/test_fcmp_O${optlevel}_${attribute}
        "${OUTDIR}"/test_global_O${optlevel}_${attribute}

--- a/crosstest/test_calling_conv.cpp
+++ b/crosstest/test_calling_conv.cpp
+//===- subzero/crosstest/test_calling_conv.cpp - Implementation for tests -===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the test functions used to check that Subzero
+// generates code compatible with the calling convention used by
+// llc. "Caller" functions test the handling of out-args, and "callee"
+// functions test the handling of in-args.
+//
+//===----------------------------------------------------------------------===//
+#include <cstring>
+#include "test_calling_conv.h"
+// Reinterprets the generic function pointer Func as a pointer to the
+// function type Ty so that it can be called with Ty's argument list.
+#define CALL_AS_TYPE(Ty, Func) (reinterpret_cast<Ty *>(Func))
+// Calls the function stored in the global Callee as a callee_i_Ty,
+// passing a single int argument with a recognizable bit pattern.
+void caller_i(void) {
+  int arg1 = 0x12345678;
+  CALL_AS_TYPE(callee_i_Ty, Callee)(arg1);
+}
+// Calls the function stored in the global Callee as a callee_vvvvv_Ty,
+// passing five v4si32 vector arguments.  Every element value is
+// distinct (0..19) so that a misplaced argument is detectable.
+void caller_vvvvv(void) {
+  v4si32 arg1 = {0, 1, 2, 3};
+  v4si32 arg2 = {4, 5, 6, 7};
+  v4si32 arg3 = {8, 9, 10, 11};
+  v4si32 arg4 = {12, 13, 14, 15};
+  v4si32 arg5 = {16, 17, 18, 19};
+  CALL_AS_TYPE(callee_vvvvv_Ty, Callee)(arg1, arg2, arg3, arg4, arg5);
+}
+// Calls the function stored in the global Callee as a
+// callee_vlvlivfvdviv_Ty, passing twelve arguments that interleave
+// v4f32 vectors with int64_t, int, float and double scalars.
+void caller_vlvlivfvdviv(void) {
+  v4f32 arg1 = {0, 1, 2, 3};
+  int64_t arg2 = 4;
+  v4f32 arg3 = {6, 7, 8, 9};
+  int64_t arg4 = 10;
+  int arg5 = 11;
+  v4f32 arg6 = {12, 13, 14, 15};
+  float arg7 = 16;
+  v4f32 arg8 = {17, 18, 19, 20};
+  double arg9 = 21;
+  v4f32 arg10 = {22, 23, 24, 25};
+  int arg11 = 26;
+  v4f32 arg12 = {27, 28, 29, 30};
+  CALL_AS_TYPE(callee_vlvlivfvdviv_Ty, Callee)(arg1, arg2, arg3, arg4, arg5,
+                                               arg6, arg7, arg8, arg9, arg10,
+                                               arg11, arg12);
+}
+// Expands to a switch case that copies the parameter named arg<ARGNUM>
+// to the start of Buf.  Case labels therefore use the same 1-based
+// numbering as the parameter names (arg1, arg2, ...).
+#define HANDLE_ARG(ARGNUM)                                                     \
+  case ARGNUM:                                                                 \
+    memcpy(&Buf[0], &arg##ARGNUM, sizeof(arg##ARGNUM));                        \
+    break;
+// Copies arg1 into Buf when ArgNum is 1; any other ArgNum value leaves
+// Buf untouched.
+void __attribute__((noinline)) callee_i(int arg1) {
+  switch (ArgNum) { HANDLE_ARG(1); }
+}
+// Copies the argument selected by ArgNum (1..5) into Buf; any other
+// ArgNum value leaves Buf untouched.
+void __attribute__((noinline))
+callee_vvvvv(v4si32 arg1, v4si32 arg2, v4si32 arg3, v4si32 arg4, v4si32 arg5) {
+  switch (ArgNum) {
+    HANDLE_ARG(1);
+    HANDLE_ARG(2);
+    HANDLE_ARG(3);
+    HANDLE_ARG(4);
+    HANDLE_ARG(5);
+  }
+}
+// Copies the argument selected by ArgNum (1..12) into Buf; any other
+// ArgNum value leaves Buf untouched.
+void __attribute__((noinline))
+callee_vlvlivfvdviv(v4f32 arg1, int64_t arg2, v4f32 arg3, int64_t arg4, int arg5,
+                    v4f32 arg6, float arg7, v4f32 arg8, double arg9, v4f32 arg10,
+                    int arg11, v4f32 arg12) {
+  switch (ArgNum) {
+    HANDLE_ARG(1);
+    HANDLE_ARG(2);
+    HANDLE_ARG(3);
+    HANDLE_ARG(4);
+    HANDLE_ARG(5);
+    HANDLE_ARG(6);
+    HANDLE_ARG(7);
+    HANDLE_ARG(8);
+    HANDLE_ARG(9);
+    HANDLE_ARG(10);
+    HANDLE_ARG(11);
+    HANDLE_ARG(12);
+  }
+}
--- a/crosstest/test_calling_conv.def
+++ b/crosstest/test_calling_conv.def
+//===- subzero/crosstest/test_calling_conv.def - testing macros -*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for testing the calling convention.
+//
+//===----------------------------------------------------------------------===//
+#ifndef TEST_CALLING_CONV_DEF
+#define TEST_CALLING_CONV_DEF
+#define STR(x) (#x)
+#define TEST_FUNC_TABLE                            \
+/* caller,             callee,           argc */   \
+X(caller_i,            callee_i,              1)   \
+X(caller_vvvvv,        callee_vvvvv,          5)   \
+X(caller_vlvlivfvdviv, callee_vlvlivfvdviv,  12)   \
+// #define X(caller, callee, argc)
+#endif // TEST_CALLING_CONV_DEF
--- a/crosstest/test_calling_conv.h
+++ b/crosstest/test_calling_conv.h
+//===- subzero/crosstest/test_calling_conv.h - Test prototypes --*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for crosstesting the calling
+// convention.
+//
+//===----------------------------------------------------------------------===//
+#include "test_calling_conv.def"
+#include "vectors.h"
+// Generic pointer type used to store any callee; callers cast it back
+// to the callee's real function type before calling through it.
+typedef void (*CalleePtrTy)();
+// State shared between the driver and the test functions: the callee
+// to invoke, the number of the argument to capture, and the buffer the
+// callee copies that argument into.
+extern CalleePtrTy Callee;
+extern size_t ArgNum;
+extern char *Buf;
+// Caller / callee pair taking a single int.
+// NOTE(review): caller_alloca_i and callee_alloca_i are declared here
+// but have no definition in test_calling_conv.cpp and no entry in
+// TEST_FUNC_TABLE -- confirm whether they are still needed.
+void caller_i();
+void caller_alloca_i();
+typedef void callee_i_Ty(int);
+callee_i_Ty callee_i;
+callee_i_Ty callee_alloca_i;
+// Caller / callee pair taking five v4si32 vectors.
+void caller_vvvvv();
+typedef void (callee_vvvvv_Ty)(v4si32, v4si32, v4si32, v4si32, v4si32);
+callee_vvvvv_Ty callee_vvvvv;
+// Caller / callee pair taking vectors interleaved with scalar types.
+void caller_vlvlivfvdviv();
+typedef void(callee_vlvlivfvdviv_Ty)(v4f32, int64_t, v4f32, int64_t, int, v4f32,
+                                    float, v4f32, double, v4f32, int, v4f32);
+callee_vlvlivfvdviv_Ty callee_vlvlivfvdviv;
--- a/crosstest/test_calling_conv_main.cpp
+++ b/crosstest/test_calling_conv_main.cpp
+//===- subzero/crosstest/test_calling_conv_main.cpp - Driver for tests ----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the driver for cross testing the compatibility of
+// calling conventions.
+//
+//===----------------------------------------------------------------------===//
+/* crosstest.py --test=test_calling_conv.cpp               \
+   --driver=test_calling_conv_main.cpp --prefix=Subzero_   \
+   --output=test_calling_conv */
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include "test_calling_conv.h"
+namespace Subzero_ {
+#include "test_calling_conv.h"
+}
+// The crosstest code consists of caller / callee function pairs.
+//
+// The caller function initializes a list of arguments and calls the
+// function located at Callee.
+//
+// The callee function writes the argument numbered ArgNum into the
+// location pointed to by Buf.
+//
+// testCaller() tests that caller functions, as compiled by Subzero and
+// llc, pass arguments to the callee in the same way.  The Caller() and
+// Subzero_Caller() functions both call the same callee (which has been
+// compiled by llc).  The result in the global buffer is compared to
+// check that it is the same value after the calls by both callers.
+//
+// testCallee() runs the same kind of test, except that the functions
+// Callee() and Subzero_Callee() are being tested to ensure that both
+// functions receive arguments from the caller in the same way.  The
+// caller is compiled by llc.
+size_t ArgNum, Subzero_ArgNum;
+CalleePtrTy Callee, Subzero_Callee;
+char *Buf, *Subzero_Buf;
+const static size_t BUF_SIZE = 16;
+// Formats the first BUF_SIZE bytes of Buf as space-separated decimal
+// values, for use in failure messages.  Each byte is converted through
+// unsigned char first so that bytes >= 0x80 (such as the 0xff fill
+// pattern) print as 128..255 instead of being sign-extended to a
+// 32-bit value when plain char is signed.
+std::string bufAsString(const char Buf[BUF_SIZE]) {
+  std::ostringstream OS;
+  for (size_t i = 0; i < BUF_SIZE; ++i) {
+    if (i > 0)
+      OS << " ";
+    OS << static_cast<unsigned>(static_cast<unsigned char>(Buf[i]));
+  }
+  return OS.str();
+}
+// Checks that callers compiled by llc and by Subzero pass arguments in
+// the same way.  For every entry in TEST_FUNC_TABLE and every argument
+// of that entry, both callers are run against the same callee and the
+// bytes the callee captured are compared.  TotalTests, Passes and
+// Failures are incremented in place; mismatches are reported on
+// std::cout.
+void testCaller(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  static struct {
+    const char *CallerName, *CalleeName;
+    size_t Args;
+    void (*Caller)(void);
+    void (*Subzero_Caller)(void);
+    CalleePtrTy Callee;
+  } Funcs[] = {
+#define X(caller, callee, argc)                                                \
+  {                                                                            \
+    STR(caller), STR(callee), argc, &caller, &Subzero_::caller,                \
+        reinterpret_cast<CalleePtrTy>(&callee),                                \
+  }                                                                            \
+  ,
+    TEST_FUNC_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    char BufLlc[BUF_SIZE], BufSz[BUF_SIZE];
+    Callee = Subzero_Callee = Funcs[f].Callee;
+    for (size_t i = 0; i < Funcs[f].Args; ++i) {
+      // Refill both buffers so that a callee which writes nothing (or
+      // fewer than BUF_SIZE bytes) still yields comparable contents.
+      memset(BufLlc, 0xff, sizeof(BufLlc));
+      memset(BufSz, 0xff, sizeof(BufSz));
+      // The callees' HANDLE_ARG cases are numbered 1..Args to match the
+      // parameter names arg1, arg2, ...; selecting i itself would leave
+      // ArgNum 0 matching no case and the last argument never tested.
+      ArgNum = Subzero_ArgNum = i + 1;
+      Buf = BufLlc;
+      Funcs[f].Caller();
+      Buf = BufSz;
+      Funcs[f].Subzero_Caller();
+      ++TotalTests;
+      if (!memcmp(BufLlc, BufSz, sizeof(BufLlc))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "testCaller(Caller=" << Funcs[f].CallerName
+                  << ", Callee=" << Funcs[f].CalleeName << ", ArgNum=" << ArgNum
+                  << ")\nsz =" << bufAsString(BufSz)
+                  << "\nllc=" << bufAsString(BufLlc) << "\n";
+      }
+    }
+  }
+}
+// Checks that callees compiled by llc and by Subzero receive arguments
+// in the same way.  For every entry in TEST_FUNC_TABLE and every
+// argument of that entry, the same caller is run once against each
+// callee and the bytes captured by the two callees are compared.
+// TotalTests, Passes and Failures are incremented in place; mismatches
+// are reported on std::cout.
+void testCallee(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  static struct {
+    const char *CallerName, *CalleeName;
+    size_t Args;
+    void (*Caller)(void);
+    CalleePtrTy Callee, Subzero_Callee;
+  } Funcs[] = {
+#define X(caller, callee, argc)                                                \
+  {                                                                            \
+    STR(caller), STR(callee), argc, &caller,                                   \
+        reinterpret_cast<CalleePtrTy>(&callee),                                \
+        reinterpret_cast<CalleePtrTy>(&Subzero_::callee)                       \
+  }                                                                            \
+  ,
+    TEST_FUNC_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    char BufLlc[BUF_SIZE], BufSz[BUF_SIZE];
+    Buf = BufLlc;
+    Subzero_Buf = BufSz;
+    for (size_t i = 0; i < Funcs[f].Args; ++i) {
+      // Refill both buffers so that a callee which writes nothing (or
+      // fewer than BUF_SIZE bytes) still yields comparable contents.
+      memset(BufLlc, 0xff, sizeof(BufLlc));
+      memset(BufSz, 0xff, sizeof(BufSz));
+      // The callees' HANDLE_ARG cases are numbered 1..Args to match the
+      // parameter names arg1, arg2, ...; selecting i itself would leave
+      // ArgNum 0 matching no case and the last argument never tested.
+      ArgNum = Subzero_ArgNum = i + 1;
+      Callee = Funcs[f].Callee;
+      Funcs[f].Caller();
+      Callee = Funcs[f].Subzero_Callee;
+      Funcs[f].Caller();
+      ++TotalTests;
+      if (!memcmp(BufLlc, BufSz, sizeof(BufLlc))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "testCallee(Caller=" << Funcs[f].CallerName
+                  << ", Callee=" << Funcs[f].CalleeName << ", ArgNum=" << ArgNum
+                  << ")\nsz =" << bufAsString(BufSz)
+                  << "\nllc=" << bufAsString(BufLlc) << "\n";
+      }
+    }
+  }
+}
+// Runs the caller and callee test suites, prints a one-line summary,
+// and returns the number of failed tests as the process exit status.
+int main(int argc, char *argv[]) {
+  size_t Failures = 0;
+  size_t Passes = 0;
+  size_t TotalTests = 0;
+
+  testCaller(TotalTests, Passes, Failures);
+  testCallee(TotalTests, Passes, Failures);
+
+  std::cout << "TotalTests=" << TotalTests;
+  std::cout << " Passes=" << Passes;
+  std::cout << " Failures=" << Failures << "\n";
+  return Failures;
+}
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -92,6 +92,9 @@ OperandX8632Mem::OperandX8632Mem(Cfg *Func, Type Ty, Variable *Base,
  }
 }
+// Builds an Adjuststack instruction with no source operands and no
+// Dest; Amount is stored for use by emit() and dump().
+InstX8632AdjustStack::InstX8632AdjustStack(Cfg *Func, SizeT Amount)
+    : InstX8632(Func, InstX8632::Adjuststack, 0, NULL), Amount(Amount) {}
 InstX8632Mul::InstX8632Mul(Cfg *Func, Variable *Dest, Variable *Source1,
                           Operand *Source2)
    : InstX8632(Func, InstX8632::Mul, 2, Dest) {
@@ -226,6 +229,12 @@ InstX8632Movp::InstX8632Movp(Cfg *Func, Variable *Dest, Operand *Source)
  addSource(Source);
 }
+// Builds a StoreP instruction with no Dest.  The value to store is
+// source 0 and the memory operand is source 1.
+InstX8632StoreP::InstX8632StoreP(Cfg *Func, Operand *Value, OperandX8632 *Mem)
+    : InstX8632(Func, InstX8632::StoreP, 2, NULL) {
+  addSource(Value);
+  addSource(Mem);
+}
 InstX8632StoreQ::InstX8632StoreQ(Cfg *Func, Operand *Value, OperandX8632 *Mem)
    : InstX8632(Func, InstX8632::StoreQ, 2, NULL) {
  addSource(Value);
@@ -933,6 +942,24 @@ void InstX8632Store::dump(const Cfg *Func) const {
  getSrc(0)->dump(Func);
 }
+// Emits the store as "movups <mem>, <value>": the memory operand
+// (source 1) is written first, followed by the value (source 0).
+void InstX8632StoreP::emit(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Out << "\tmovups\t";
+  Operand *MemOperand = getSrc(1);
+  MemOperand->emit(Func);
+  Out << ", ";
+  Operand *ValueOperand = getSrc(0);
+  ValueOperand->emit(Func);
+  Out << "\n";
+}
+// Dumps the instruction as "storep.<type> <mem>, <value>", where
+// <type> is the type of the stored value (source 0).
+void InstX8632StoreP::dump(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrDump();
+  Out << "storep.";
+  Out << getSrc(0)->getType();
+  Out << " ";
+  getSrc(1)->dump(Func);
+  Out << ", ";
+  getSrc(0)->dump(Func);
+}
 void InstX8632StoreQ::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 2);
@@ -1236,6 +1263,17 @@ void InstX8632Pop::dump(const Cfg *Func) const {
  Str << " = pop." << getDest()->getType() << " ";
 }
+// Emits "sub esp, <Amount>" and then reports the same amount to the
+// target through updateStackAdjustment().
+void InstX8632AdjustStack::emit(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrEmit();
+  Out << "\tsub\tesp, ";
+  Out << Amount;
+  Out << "\n";
+  Func->getTarget()->updateStackAdjustment(Amount);
+}
+// Dumps the adjustment in the form "esp = sub.i32 esp, <Amount>".
+void InstX8632AdjustStack::dump(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrDump();
+  Out << "esp = sub.i32 esp, ";
+  Out << Amount;
+}
 void InstX8632Push::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 1);

--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -137,6 +137,7 @@ public:
    Add,
    Addps,
    Addss,
+    Adjuststack,
    And,
    Blendvps,
    Br,
@@ -204,6 +205,7 @@ public:
    Shufps,
    Sqrtss,
    Store,
+    StoreP,
    StoreQ,
    Sub,
    Subps,
@@ -340,6 +342,26 @@ private:
  InstX8632Label *Label; // Intra-block branch target
 };
+// AdjustStack instruction - subtracts esp by the given amount and
+// updates the stack offset during code emission.  The instruction has
+// no source operands and no Dest; the adjustment is carried only in
+// Amount.
+class InstX8632AdjustStack : public InstX8632 {
+public:
+  // Allocates the instruction from Func's allocator.
+  static InstX8632AdjustStack *create(Cfg *Func, SizeT Amount) {
+    return new (Func->allocate<InstX8632AdjustStack>())
+        InstX8632AdjustStack(Func, Amount);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Adjuststack); }
+private:
+  InstX8632AdjustStack(Cfg *Func, SizeT Amount);
+  InstX8632AdjustStack(const InstX8632AdjustStack &) LLVM_DELETED_FUNCTION;
+  InstX8632AdjustStack &operator=(const InstX8632AdjustStack &)
+      LLVM_DELETED_FUNCTION;
+  // Private destructor, matching the other instruction classes in this
+  // file (e.g. InstX8632StoreP, InstX8632Movp).
+  virtual ~InstX8632AdjustStack() {}
+  SizeT Amount; // Number of bytes subtracted from esp.
+};
 // Call instruction.  Arguments should have already been pushed.
 class InstX8632Call : public InstX8632 {
 public:
@@ -960,6 +982,23 @@ private:
  virtual ~InstX8632Movp() {}
 };
+// Packed store, emitted as a "movups" instruction whose destination is
+// an OperandX8632Mem.  Both the stored value and the memory operand
+// are source operands; the instruction is created without a Dest
+// operand.
+class InstX8632StoreP : public InstX8632 {
+public:
+  // Allocates the instruction from Func's allocator.
+  static InstX8632StoreP *create(Cfg *Func, Operand *Value, OperandX8632 *Mem) {
+    return new (Func->allocate<InstX8632StoreP>())
+        InstX8632StoreP(Func, Value, Mem);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, StoreP); }
+private:
+  InstX8632StoreP(Cfg *Func, Operand *Value, OperandX8632 *Mem);
+  InstX8632StoreP(const InstX8632StoreP &) LLVM_DELETED_FUNCTION;
+  InstX8632StoreP &operator=(const InstX8632StoreP &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632StoreP() {}
+};
 // This is essentially a "movq" instruction with an OperandX8632Mem
 // operand instead of Variable as the destination.  It's important
 // for liveness that there is no Dest operand.

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -188,6 +188,9 @@ protected:
  void _add(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Add::create(Func, Dest, Src0));
  }
+  // Inserts an InstX8632AdjustStack for Amount at the current
+  // insertion point.
+  // NOTE(review): Amount is int32_t here but InstX8632AdjustStack takes
+  // a SizeT (presumably unsigned) -- confirm that callers never pass a
+  // negative value.
+  void _adjust_stack(int32_t Amount) {
+    Context.insert(InstX8632AdjustStack::create(Func, Amount));
+  }
  void _addps(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Addps::create(Func, Dest, Src0));
  }
@@ -412,6 +415,9 @@ protected:
  void _store(Operand *Value, OperandX8632 *Mem) {
    Context.insert(InstX8632Store::create(Func, Value, Mem));
  }
+  // Inserts an InstX8632StoreP that stores Value to Mem at the current
+  // insertion point.
+  void _storep(Operand *Value, OperandX8632 *Mem) {
+    Context.insert(InstX8632StoreP::create(Func, Value, Mem));
+  }
  void _storeq(Operand *Value, OperandX8632 *Mem) {
    Context.insert(InstX8632StoreQ::create(Func, Value, Mem));
  }
@@ -450,6 +456,7 @@ protected:
  const X86InstructionSet InstructionSet;
  bool IsEbpBasedFrame;
+  bool NeedsStackAlignment;
  size_t FrameSizeLocals;
  size_t LocalsSizeBytes;
  llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];

--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -33,40 +33,50 @@ entry:
  ret i32 %add3
 }
 ; CHECK: pass64BitArg:
-; CHECK:      push    123
+; CHECK:      sub     esp
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp+4]
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp]
-; CHECK-NEXT: call    ignore64BitArgNoInline
+; CHECK:      mov     dword ptr [esp+8], 123
-; CHECK:      push
+; CHECK:      mov     dword ptr [esp+16]
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp+12]
-; CHECK-NEXT: push    123
+; CHECK:      call    ignore64BitArgNoInline
-; CHECK-NEXT: push
+; CHECK:      sub     esp
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp+4]
-; CHECK-NEXT: call    ignore64BitArgNoInline
+; CHECK:      mov     dword ptr [esp]
-; CHECK:      push
+; CHECK:      mov     dword ptr [esp+8], 123
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp+16]
-; CHECK-NEXT: push    123
+; CHECK:      mov     dword ptr [esp+12]
-; CHECK-NEXT: push
+; CHECK:      call    ignore64BitArgNoInline
-; CHECK-NEXT: push
+; CHECK:      sub     esp
-; CHECK-NEXT: call    ignore64BitArgNoInline
+; CHECK:      mov     dword ptr [esp+4]
+; CHECK:      mov     dword ptr [esp]
+; CHECK:      mov     dword ptr [esp+8], 123
+; CHECK:      mov     dword ptr [esp+16]
+; CHECK:      mov     dword ptr [esp+12]
+; CHECK:      call    ignore64BitArgNoInline
 ;
 ; OPTM1: pass64BitArg:
-; OPTM1:      push    123
+; OPTM1:      sub     esp
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp+4]
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp]
-; OPTM1-NEXT: call    ignore64BitArgNoInline
+; OPTM1:      mov     dword ptr [esp+8], 123
-; OPTM1:      push
+; OPTM1:      mov     dword ptr [esp+16]
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp+12]
-; OPTM1-NEXT: push    123
+; OPTM1:      call    ignore64BitArgNoInline
-; OPTM1-NEXT: push
+; OPTM1:      sub     esp
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp+4]
-; OPTM1-NEXT: call    ignore64BitArgNoInline
+; OPTM1:      mov     dword ptr [esp]
-; OPTM1:      push
+; OPTM1:      mov     dword ptr [esp+8], 123
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp+16]
-; OPTM1-NEXT: push    123
+; OPTM1:      mov     dword ptr [esp+12]
-; OPTM1-NEXT: push
+; OPTM1:      call    ignore64BitArgNoInline
-; OPTM1-NEXT: push
+; OPTM1:      sub     esp
-; OPTM1-NEXT: call    ignore64BitArgNoInline
+; OPTM1:      mov     dword ptr [esp+4]
+; OPTM1:      mov     dword ptr [esp]
+; OPTM1:      mov     dword ptr [esp+8], 123
+; OPTM1:      mov     dword ptr [esp+16]
+; OPTM1:      mov     dword ptr [esp+12]
+; OPTM1:      call    ignore64BitArgNoInline
 declare i32 @ignore64BitArgNoInline(i64, i32, i64)
@@ -76,19 +86,21 @@ entry:
  ret i32 %call
 }
 ; CHECK: pass64BitConstArg:
-; CHECK:      push    3735928559
+; CHECK:      sub     esp
-; CHECK-NEXT: push    305419896
+; CHECK:      mov     dword ptr [esp+4]
-; CHECK-NEXT: push    123
+; CHECK-NEXT: mov     dword ptr [esp]
-; CHECK-NEXT: push    ecx
+; CHECK-NEXT: mov     dword ptr [esp+8], 123
-; CHECK-NEXT: push    eax
+; CHECK-NEXT: mov     dword ptr [esp+16], 3735928559
+; CHECK-NEXT: mov     dword ptr [esp+12], 305419896
 ; CHECK-NEXT: call    ignore64BitArgNoInline
 ;
 ; OPTM1: pass64BitConstArg:
-; OPTM1:      push    3735928559
+; OPTM1:      sub     esp
-; OPTM1-NEXT: push    305419896
+; OPTM1:      mov     dword ptr [esp+4]
-; OPTM1-NEXT: push    123
+; OPTM1-NEXT: mov     dword ptr [esp]
-; OPTM1-NEXT: push    dword ptr [
+; OPTM1-NEXT: mov     dword ptr [esp+8], 123
-; OPTM1-NEXT: push    dword ptr [
+; OPTM1-NEXT: mov     dword ptr [esp+16], 3735928559
+; OPTM1-NEXT: mov     dword ptr [esp+12], 305419896
 ; OPTM1-NEXT: call    ignore64BitArgNoInline
 define internal i64 @return64BitArg(i64 %a) {
@@ -240,14 +252,14 @@ entry:
  ret i64 %div
 }
 ; CHECK-LABEL: div64BitSignedConst:
-; CHECK: push 2874
+; CHECK: mov     dword ptr [esp+12], 2874
-; CHECK: push 1942892530
+; CHECK: mov     dword ptr [esp+8],  1942892530
 ; CHECK: call    __divdi3
 ; CHECK: ret
 ;
 ; OPTM1-LABEL: div64BitSignedConst:
-; OPTM1: push 2874
+; OPTM1: mov     dword ptr [esp+12], 2874
-; OPTM1: push 1942892530
+; OPTM1: mov     dword ptr [esp+8],  1942892530
 ; OPTM1: call    __divdi3
 ; OPTM1: ret

--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
-; This is a basic test of the alloca instruction - one test for alloca
+; This is a basic test of the alloca instruction.
-; of a fixed size, and one test for variable size.
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
-; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -O2 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
 ; RUN: %llvm2ice -Om1 --verbose none %s \
@@ -12,45 +11,95 @@
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
 ; RUN:                           | FileCheck --check-prefix=DUMP %s
-define void @fixed_400(i32 %n) {
+define void @fixed_416_align_16(i32 %n) {
 entry:
-  %array = alloca i8, i32 400, align 16
+  %array = alloca i8, i32 416, align 16
  %__2 = ptrtoint i8* %array to i32
  call void @f1(i32 %__2)
  ret void
 }
-; CHECK: fixed_400:
+; CHECK-LABEL: fixed_416_align_16:
-; CHECK:      sub     esp, 400
+; CHECK:      sub     esp, 416
-; CHECK-NEXT: mov     eax, esp
+; CHECK:      sub     esp, 16
-; CHECK-NEXT: push    eax
+; CHECK:      mov     dword ptr [esp], eax
-; CHECK-NEXT: call    f1
+; CHECK:      call    f1
-;
-; OPTM1: fixed_400:
+define void @fixed_416_align_32(i32 %n) {
-; OPTM1:      sub     esp, 400
+entry:
-; OPTM1-NEXT: mov     {{.*}}, esp
+  %array = alloca i8, i32 400, align 32
-; OPTM1:      push
+  %__2 = ptrtoint i8* %array to i32
-; OPTM1-NEXT: call    f1
+  call void @f1(i32 %__2)
+  ret void
+}
+; CHECK-LABEL: fixed_416_align_32:
+; CHECK:      and     esp, 4294967264
+; CHECK:      sub     esp, 416
+; CHECK:      sub     esp, 16
+; CHECK:      mov     dword ptr [esp], eax
+; CHECK:      call    f1
+define void @fixed_351_align_16(i32 %n) {
+entry:
+  %array = alloca i8, i32 351, align 16
+  %__2 = ptrtoint i8* %array to i32
+  call void @f1(i32 %__2)
+  ret void
+}
+; CHECK-LABEL: fixed_351_align_16:
+; CHECK:      sub     esp, 352
+; CHECK:      sub     esp, 16
+; CHECK:      mov     dword ptr [esp], eax
+; CHECK:      call    f1
+define void @fixed_351_align_32(i32 %n) {
+entry:
+  %array = alloca i8, i32 351, align 32
+  %__2 = ptrtoint i8* %array to i32
+  call void @f1(i32 %__2)
+  ret void
+}
+; CHECK-LABEL: fixed_351_align_32:
+; CHECK:      and     esp, 4294967264
+; CHECK:      sub     esp, 352
+; CHECK:      sub     esp, 16
+; CHECK:      mov     dword ptr [esp], eax
+; CHECK:      call    f1
 declare void @f1(i32)
-define void @variable_n(i32 %n) {
+define void @variable_n_align_16(i32 %n) {
 entry:
  %array = alloca i8, i32 %n, align 16
  %__2 = ptrtoint i8* %array to i32
  call void @f2(i32 %__2)
  ret void
 }
-; CHECK: variable_n:
+; CHECK-LABEL: variable_n_align_16:
 ; CHECK:      mov     eax, dword ptr [ebp+8]
-; CHECK-NEXT: sub     esp, eax
+; CHECK:      add     eax, 15
-; CHECK-NEXT: mov     eax, esp
+; CHECK:      and     eax, 4294967280
-; CHECK-NEXT: push    eax
+; CHECK:      sub     esp, eax
-; CHECK-NEXT: call    f2
+; CHECK:      sub     esp, 16
-;
+; CHECK:      mov     dword ptr [esp], eax
-; OPTM1: variable_n:
+; CHECK:      call    f2
-; OPTM1:      mov     {{.*}}, esp
-; OPTM1:      push
+define void @variable_n_align_32(i32 %n) {
-; OPTM1-NEXT: call    f2
+entry:
+  %array = alloca i8, i32 %n, align 32
+  %__2 = ptrtoint i8* %array to i32
+  call void @f2(i32 %__2)
+  ret void
+}
+; In -O2, the order of the CHECK-DAG lines in the output is switched.
+; CHECK-LABEL: variable_n_align_32:
+; CHECK-DAG:  and     esp, 4294967264
+; CHECK-DAG:  mov     eax, dword ptr [ebp+8]
+; CHECK:      add     eax, 31
+; CHECK:      and     eax, 4294967264
+; CHECK:      sub     esp, eax
+; CHECK:      sub     esp, 16
+; CHECK:      mov     dword ptr [esp], eax
+; CHECK:      call    f2
 declare void @f2(i32)

--- a/tests_lit/llvm2ice_tests/ebp_args.ll
+++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -22,18 +22,23 @@ entry:
 ; lowering code changes.
 ; CHECK: memcpy_helper:
-; CHECK: push     ebp
+; CHECK:  push     ebx
-; CHECK: mov      ebp, esp
+; CHECK:  push     ebp
-; CHECK: sub      esp, 20
+; CHECK:  mov      ebp, esp
-; CHECK: mov      eax, dword ptr [ebp+12]
+; CHECK:  sub      esp, 20
-; CHECK: mov      dword ptr [ebp-4], eax
+; CHECK:  mov      eax, dword ptr [ebp+16]
-; CHECK: sub      esp, 128
+; CHECK:  mov      dword ptr [ebp-4], eax
-; CHECK: mov      dword ptr [ebp-8], esp
+; CHECK:  sub      esp, 128
-; CHECK: mov      eax, dword ptr [ebp-8]
+; CHECK:  mov      dword ptr [ebp-8], esp
-; CHECK: mov      dword ptr [ebp-12], eax
+; CHECK:  mov      eax, dword ptr [ebp-8]
-; CHECK: movzx    eax, byte ptr [ebp-4]
+; CHECK:  mov      dword ptr [ebp-12], eax
-; CHECK: mov      dword ptr [ebp-16], eax
+; CHECK:  movzx    eax, byte ptr [ebp-4]
-; CHECK: push     dword ptr [ebp-16]
+; CHECK:  mov      dword ptr [ebp-16], eax
-; CHECK: push     dword ptr [ebp-12]
+; CHECK:  sub      esp, 16
-; CHECK: push     dword ptr [ebp+8]
+; CHECK:  mov      ecx, dword ptr [ebp+12]
-; CHECK: call     memcpy_helper2
+; CHECK:  mov      dword ptr [esp], ecx
+; CHECK:  mov      edx, dword ptr [ebp-12]
+; CHECK:  mov      dword ptr [esp+4], edx
+; CHECK:  mov      ebx, dword ptr [ebp-16]
+; CHECK:  mov      dword ptr [esp+8], ebx
+; CHECK:  call     memcpy_helper2
--- a/tests_lit/llvm2ice_tests/fp.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/fp.pnacl.ll
@@ -45,11 +45,11 @@ entry:
  ret i32 %add3
 }
 ; CHECK-LABEL: passFpArgs
-; CHECK: push 123
+; CHECK: mov dword ptr [esp+4], 123
 ; CHECK: call ignoreFpArgsNoInline
-; CHECK: push 123
+; CHECK: mov dword ptr [esp+4], 123
 ; CHECK: call ignoreFpArgsNoInline
-; CHECK: push 123
+; CHECK: mov dword ptr [esp+4], 123
 ; CHECK: call ignoreFpArgsNoInline
 declare i32 @ignoreFpArgsNoInline(float, i32, double)
@@ -60,7 +60,7 @@ entry:
  ret i32 %call
 }
 ; CHECK-LABEL: passFpConstArg
-; CHECK: push 123
+; CHECK: mov dword ptr [esp+4], 123
 ; CHECK: call ignoreFpArgsNoInline
 define internal i32 @passFp32ConstArg(float %a) {
@@ -69,8 +69,8 @@ entry:
  ret i32 %call
 }
 ; CHECK-LABEL: passFp32ConstArg
-; CHECK: push dword
+; CHECK: mov dword ptr [esp+4], 123
-; CHECK: push 123
+; CHECK: movss dword ptr [esp+8]
 ; CHECK: call ignoreFp32ArgsNoInline
 declare i32 @ignoreFp32ArgsNoInline(float, i32, float)
@@ -415,8 +415,8 @@ entry:
  ret double %conv
 }
 ; CHECK-LABEL: unsigned64ToDouble
-; CHECK: push 2874
+; CHECK: mov dword ptr [esp+4], 2874
-; CHECK: push 1942892530
+; CHECK: mov dword ptr [esp], 1942892530
 ; CHECK: call cvtui64tod
 ; CHECK: fstp

--- a/tests_lit/llvm2ice_tests/undef.ll
+++ b/tests_lit/llvm2ice_tests/undef.ll
@@ -37,8 +37,7 @@ define float @undef_float() {
 entry:
  ret float undef
 ; CHECK-LABEL: undef_float:
-; CHECK-NOT: sub esp
+; CHECK: [L$float$
-; CHECK: fld
 }
 define <4 x i1> @undef_v4i1() {

--- a/tests_lit/llvm2ice_tests/vector-arg.ll
+++ b/tests_lit/llvm2ice_tests/vector-arg.ll
 ; This file checks that Subzero generates code in accordance with the
 ; calling convention for vectors.
-; NOTE: CHECK / OPTM1 lines containing the following strings may be
-; subject to change:
-;
-; * movups: The movups instruction may be changed to movaps when the
-; load / store operation is 16 byte aligned.
-;
-; * stack offsets: These may need to be changed if stack alignment
-; support is implemented.
-;
-; * stack adjustment operations
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
 ; RUN: %llvm2ice -O2 --verbose none %s \
@@ -150,7 +139,7 @@ define <4 x float> @test_returning_interspersed_arg4(i32 %i32arg0, double %doubl
 entry:
  ret <4 x float> %arg4
 ; CHECK-LABEL: test_returning_interspersed_arg4:
-; CHECK: movups xmm0, xmmword ptr [esp+44]
+; CHECK: movups xmm0, xmmword ptr [esp+52]
 ; CHECK: ret
 ; OPTM1-LABEL: test_returning_interspersed_arg4:
@@ -172,33 +161,69 @@ entry:
  call void @VectorArgs(<4 x float> %arg9, <4 x float> %arg8, <4 x float> %arg7, <4 x float> %arg6, <4 x float> %arg5, <4 x float> %arg4)
  ret void
 ; CHECK-LABEL: test_passing_vectors:
-; CHECK: movups  [[ARG6:.*]], xmmword ptr [esp+4]
+; CHECK: sub esp, 32
-; CHECK: sub esp, 16
+; CHECK: movups  [[ARG5:.*]], xmmword ptr [esp+64]
-; CHECK-NEXT: movups xmmword ptr [esp], [[ARG6]]
+; CHECK: movups  xmmword ptr [esp], [[ARG5]]
-; CHECK: movups  [[ARG5:.*]], xmmword ptr [esp+36]
+; CHECK: movups  [[ARG6:.*]], xmmword ptr [esp+48]
-; CHECK: sub esp, 16
+; CHECK: movups  xmmword ptr [esp+16], [[ARG6]]
-; CHECK-NEXT: movups xmmword ptr [esp], [[ARG5]]
+; CHECK: movups  xmm0, xmmword ptr [esp+128]
-; CHECK: movups  xmm0, xmmword ptr [esp+116]
+; CHECK: movups  xmm1, xmmword ptr [esp+112]
-; CHECK: movups  xmm1, xmmword ptr [esp+100]
+; CHECK: movups  xmm2, xmmword ptr [esp+96]
-; CHECK: movups  xmm2, xmmword ptr [esp+84]
+; CHECK: movups  xmm3, xmmword ptr [esp+80]
-; CHECK: movups  xmm3, xmmword ptr [esp+68]
 ; CHECK: call VectorArgs
 ; CHECK-NEXT: add esp, 32
 ; CHECK: ret
 ; OPTM1-LABEL: test_passing_vectors:
-; OPTM1: movups  [[ARG6:.*]], xmmword ptr {{.*}}
+; OPTM1: sub esp, 32
-; OPTM1: sub esp, 16
-; OPTM1: movups xmmword ptr [esp], [[ARG6]]
 ; OPTM1: movups  [[ARG5:.*]], xmmword ptr {{.*}}
-; OPTM1: sub esp, 16
+; OPTM1: movups  xmmword ptr [esp], [[ARG5]]
-; OPTM1-NEXT: movups xmmword ptr [esp], [[ARG5]]
+; OPTM1: movups  [[ARG6:.*]], xmmword ptr {{.*}}
+; OPTM1: movups  xmmword ptr [esp+16], [[ARG6]]
 ; OPTM1: movups  xmm0, xmmword ptr {{.*}}
 ; OPTM1: movups  xmm1, xmmword ptr {{.*}}
 ; OPTM1: movups  xmm2, xmmword ptr {{.*}}
 ; OPTM1: movups  xmm3, xmmword ptr {{.*}}
 ; OPTM1: call VectorArgs
-; OPTM1: add esp, 32
+; OPTM1-NEXT: add esp, 32
+; OPTM1: ret
+}
+declare void @InterspersedVectorArgs(<4 x float>, i64, <4 x float>, i64, <4 x float>, float, <4 x float>, double, <4 x float>, i32, <4 x float>)
+define void @test_passing_vectors_interspersed(<4 x float> %arg0, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3, <4 x float> %arg4, <4 x float> %arg5, <4 x float> %arg6, <4 x float> %arg7, <4 x float> %arg8, <4 x float> %arg9) {
+entry:
+  ; Kills XMM registers so that no in-arg lowering code interferes
+  ; with the test.
+  call void @killXmmRegisters()
+  call void @InterspersedVectorArgs(<4 x float> %arg9, i64 0, <4 x float> %arg8, i64 1, <4 x float> %arg7, float 2.000000e+00, <4 x float> %arg6, double 3.000000e+00, <4 x float> %arg5, i32 4, <4 x float> %arg4)
+  ret void
+; CHECK-LABEL: test_passing_vectors_interspersed:
+; CHECK: sub esp, 80
+; CHECK: movups  [[ARG9:.*]], xmmword ptr [esp+112]
+; CHECK: movups  xmmword ptr [esp+32], [[ARG9]]
+; CHECK: movups  [[ARG11:.*]], xmmword ptr [esp+96]
+; CHECK: movups  xmmword ptr [esp+64], [[ARG11]]
+; CHECK: movups  xmm0, xmmword ptr [esp+176]
+; CHECK: movups  xmm1, xmmword ptr [esp+160]
+; CHECK: movups  xmm2, xmmword ptr [esp+144]
+; CHECK: movups  xmm3, xmmword ptr [esp+128]
+; CHECK: call InterspersedVectorArgs
+; CHECK-NEXT: add esp, 80
+; CHECK: ret
+; OPTM1-LABEL: test_passing_vectors_interspersed:
+; OPTM1: sub esp, 80
+; OPTM1: movups  [[ARG9:.*]], xmmword ptr {{.*}}
+; OPTM1: movups  xmmword ptr [esp+32], [[ARG9]]
+; OPTM1: movups  [[ARG11:.*]], xmmword ptr {{.*}}
+; OPTM1: movups  xmmword ptr [esp+64], [[ARG11]]
+; OPTM1: movups  xmm0, xmmword ptr {{.*}}
+; OPTM1: movups  xmm1, xmmword ptr {{.*}}
+; OPTM1: movups  xmm2, xmmword ptr {{.*}}
+; OPTM1: movups  xmm3, xmmword ptr {{.*}}
+; OPTM1: call InterspersedVectorArgs
+; OPTM1-NEXT: add esp, 80
 ; OPTM1: ret
 }
@@ -220,8 +245,8 @@ entry:
 ; OPTM1-LABEL: test_receiving_vectors:
 ; OPTM1: call VectorReturn
-; OPTM1: movups [[LOC:.*]], xmm0
+; OPTM1: movups {{.*}}, xmm0
-; OPTM1: movups xmm0, [[LOC]]
+; OPTM1: movups xmm0, {{.*}}
 ; OPTM1: call VectorReturn
 ; OPTM1: ret
 }