Subzero: Align the stack at the point of function calls.

Be compatible with the x86-32 calling convention by ensuring that the stack is aligned to 16 bytes at the point of the call instruction. Also ensure that vector arguments passed on the stack are 16-byte aligned, and make alloca instructions respect their requested alignment.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/444443002

Subzero: Align the stack at the point of function calls.
105b7044 · Matt Wala · 8835b89b · 105b7044 · 105b7044 · 105b7044
Commit 105b7044 authored Aug 11, 2014 by Matt Wala
16 changed files
--- a/crosstest/crosstest.py
+++ b/crosstest/crosstest.py
@@ -134,7 +134,6 @@ if __name__ == '__main__':
            objs.append(bitcode)
    linker = 'clang' if os.path.splitext(args.driver)[1] == '.c' else 'clang++'
-    # TODO: Remove -mstackrealign after Subzero supports stack alignment.
+    shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32', args.driver] +
-    shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32',
+             objs +
-              '-mstackrealign', args.driver] + objs +
             ['-lm', '-lpthread', '-o', os.path.join(args.dir, args.output)])
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -54,6 +54,14 @@ for optlevel in ${OPTLEVELS} ; do
            --output=test_bitmanip_O${optlevel}_${attribute}
        ./crosstest.py -O${optlevel} --mattr ${attribute} \
+            --prefix=Subzero_ --target=x8632 \
+            --dir="${OUTDIR}" \
+            --llvm-bin-path="${LLVM_BIN_PATH}" \
+            --test=test_calling_conv.cpp \
+            --driver=test_calling_conv_main.cpp \
+            --output=test_calling_conv_O${optlevel}_${attribute}
+        ./crosstest.py -O${optlevel} --mattr ${attribute} \
            --prefix=Subzero_ \
            --target=x8632 \
            --dir="${OUTDIR}" \
@@ -137,6 +145,7 @@ for optlevel in ${OPTLEVELS} ; do
        "${OUTDIR}"/mem_intrin_O${optlevel}_${attribute}
        "${OUTDIR}"/test_arith_O${optlevel}_${attribute}
        "${OUTDIR}"/test_bitmanip_O${optlevel}_${attribute}
+        "${OUTDIR}"/test_calling_conv_O${optlevel}_${attribute}
        "${OUTDIR}"/test_cast_O${optlevel}_${attribute}
        "${OUTDIR}"/test_fcmp_O${optlevel}_${attribute}
        "${OUTDIR}"/test_global_O${optlevel}_${attribute}

--- a/crosstest/test_calling_conv.cpp
+++ b/crosstest/test_calling_conv.cpp
+//===- subzero/crosstest/test_calling_conv.cpp - Implementation for tests -===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the test functions used to check that Subzero
+// generates code compatible with the calling convention used by
+// llc. "Caller" functions test the handling of out-args, and "callee"
+// functions test the handling of in-args.
+//
+//===----------------------------------------------------------------------===//
+#include <cstring>
+#include "test_calling_conv.h"
+#define CALL_AS_TYPE(Ty, Func) (reinterpret_cast<Ty *>(Func))
+// Calls the function installed in Callee, viewed as a callee_i_Ty, with a
+// single 32-bit integer argument.
+void caller_i(void) {
+  callee_i_Ty *Target = CALL_AS_TYPE(callee_i_Ty, Callee);
+  int Value = 0x12345678;
+  Target(Value);
+}
+// Calls the function installed in Callee, viewed as a callee_vvvvv_Ty, with
+// five v4si32 arguments -- one more than X86_MAX_XMM_ARGS allows in
+// registers.
+void caller_vvvvv(void) {
+  callee_vvvvv_Ty *Target = CALL_AS_TYPE(callee_vvvvv_Ty, Callee);
+  v4si32 First = {0, 1, 2, 3};
+  v4si32 Second = {4, 5, 6, 7};
+  v4si32 Third = {8, 9, 10, 11};
+  v4si32 Fourth = {12, 13, 14, 15};
+  v4si32 Fifth = {16, 17, 18, 19};
+  Target(First, Second, Third, Fourth, Fifth);
+}
+// Calls the function installed in Callee, viewed as a
+// callee_vlvlivfvdviv_Ty, with twelve arguments that interleave vectors
+// with 64-bit integer, 32-bit integer, float and double scalars.
+void caller_vlvlivfvdviv(void) {
+  callee_vlvlivfvdviv_Ty *Target =
+      CALL_AS_TYPE(callee_vlvlivfvdviv_Ty, Callee);
+  v4f32 Vec1 = {0, 1, 2, 3};
+  int64_t Long2 = 4;
+  v4f32 Vec3 = {6, 7, 8, 9};
+  int64_t Long4 = 10;
+  int Int5 = 11;
+  v4f32 Vec6 = {12, 13, 14, 15};
+  float Float7 = 16;
+  v4f32 Vec8 = {17, 18, 19, 20};
+  double Double9 = 21;
+  v4f32 Vec10 = {22, 23, 24, 25};
+  int Int11 = 26;
+  v4f32 Vec12 = {27, 28, 29, 30};
+  Target(Vec1, Long2, Vec3, Long4, Int5, Vec6, Float7, Vec8, Double9, Vec10,
+         Int11, Vec12);
+}
+// Expands to a switch case that copies the parameter named arg<N> into
+// the start of Buf.  Must be used inside a switch in a function whose
+// parameters are named arg1, arg2, ...
+#define HANDLE_ARG(N)                                                          \
+  case N:                                                                      \
+    memcpy(Buf, &arg##N, sizeof(arg##N));                                      \
+    break;
+// Copies arg1 into Buf when ArgNum selects it; otherwise leaves Buf
+// untouched.
+void __attribute__((noinline)) callee_i(int arg1) {
+  switch (ArgNum) {
+    HANDLE_ARG(1)
+  default:
+    break;
+  }
+}
+// Copies the vector argument selected by ArgNum (1 through 5) into Buf;
+// any other ArgNum leaves Buf untouched.
+void __attribute__((noinline))
+callee_vvvvv(v4si32 arg1, v4si32 arg2, v4si32 arg3, v4si32 arg4, v4si32 arg5) {
+  switch (ArgNum) {
+    HANDLE_ARG(1)
+    HANDLE_ARG(2)
+    HANDLE_ARG(3)
+    HANDLE_ARG(4)
+    HANDLE_ARG(5)
+  default:
+    break;
+  }
+}
+// Copies the argument selected by ArgNum (1 through 12) into Buf; any
+// other ArgNum leaves Buf untouched.
+void __attribute__((noinline))
+callee_vlvlivfvdviv(v4f32 arg1, int64_t arg2, v4f32 arg3, int64_t arg4, int arg5,
+                    v4f32 arg6, float arg7, v4f32 arg8, double arg9, v4f32 arg10,
+                    int arg11, v4f32 arg12) {
+  switch (ArgNum) {
+    HANDLE_ARG(1)
+    HANDLE_ARG(2)
+    HANDLE_ARG(3)
+    HANDLE_ARG(4)
+    HANDLE_ARG(5)
+    HANDLE_ARG(6)
+    HANDLE_ARG(7)
+    HANDLE_ARG(8)
+    HANDLE_ARG(9)
+    HANDLE_ARG(10)
+    HANDLE_ARG(11)
+    HANDLE_ARG(12)
+  default:
+    break;
+  }
+}
--- a/crosstest/test_calling_conv.def
+++ b/crosstest/test_calling_conv.def
+//===- subzero/crosstest/test_calling_conv.def - testing macros -*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for testing the calling convention.
+//
+//===----------------------------------------------------------------------===//
+#ifndef TEST_CALLING_CONV_DEF
+#define TEST_CALLING_CONV_DEF
+#define STR(x) (#x)
+#define TEST_FUNC_TABLE                            \
+/* X(caller, callee, number of callee args) */     \
+X(caller_i,            callee_i,              1)   \
+X(caller_vvvvv,        callee_vvvvv,          5)   \
+X(caller_vlvlivfvdviv, callee_vlvlivfvdviv,  12)   \
+// Usage: #define X(caller, callee, argc) ..., expand TEST_FUNC_TABLE, #undef X
+#endif // TEST_CALLING_CONV_DEF
--- a/crosstest/test_calling_conv.h
+++ b/crosstest/test_calling_conv.h
+//===- subzero/crosstest/test_calling_conv.h - Test prototypes --*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for crosstesting the calling
+// convention.
+//
+//===----------------------------------------------------------------------===//
+#include "test_calling_conv.def"
+#include "vectors.h"
+typedef void (*CalleePtrTy)();
+extern CalleePtrTy Callee;
+extern size_t ArgNum;
+extern char *Buf;
+void caller_i();
+void caller_alloca_i();
+typedef void callee_i_Ty(int);
+callee_i_Ty callee_i;
+callee_i_Ty callee_alloca_i;
+void caller_vvvvv();
+typedef void (callee_vvvvv_Ty)(v4si32, v4si32, v4si32, v4si32, v4si32);
+callee_vvvvv_Ty callee_vvvvv;
+void caller_vlvlivfvdviv();
+typedef void(callee_vlvlivfvdviv_Ty)(v4f32, int64_t, v4f32, int64_t, int, v4f32,
+                                    float, v4f32, double, v4f32, int, v4f32);
+callee_vlvlivfvdviv_Ty callee_vlvlivfvdviv;
--- a/crosstest/test_calling_conv_main.cpp
+++ b/crosstest/test_calling_conv_main.cpp
+//===- subzero/crosstest/test_calling_conv_main.cpp - Driver for tests ----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the driver for cross testing the compatibility of
+// calling conventions.
+//
+//===----------------------------------------------------------------------===//
+/* crosstest.py --test=test_calling_conv.cpp               \
+   --driver=test_calling_conv_main.cpp --prefix=Subzero_   \
+   --output=test_calling_conv */
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include "test_calling_conv.h"
+namespace Subzero_ {
+#include "test_calling_conv.h"
+}
+// The crosstest code consists of caller / callee function pairs.
+//
+// The caller function initializes a list of arguments and calls the
+// function located at Callee.
+//
+// The callee function writes the argument numbered ArgNum into the
+// location pointed to by Buf.
+//
+// testCaller() tests that caller functions, as compiled by Subzero and
+// llc, pass arguments to the callee in the same way.  The Caller() and
+// Subzero_Caller() functions both call the same callee (which has been
+// compiled by llc).  The result in the global buffer is compared to
+// check that it is the same value after the calls by both callers.
+//
+// testCallee() runs the same kind of test, except that the functions
+// Callee() and Subzero_Callee() are being tested to ensure that both
+// functions receive arguments from the caller in the same way.  The
+// caller is compiled by llc.
+size_t ArgNum, Subzero_ArgNum;
+CalleePtrTy Callee, Subzero_Callee;
+char *Buf, *Subzero_Buf;
+const static size_t BUF_SIZE = 16;
+// Formats the BUF_SIZE bytes at Buf as space-separated decimal values in
+// the range 0-255, for use in failure messages.
+std::string bufAsString(const char Buf[BUF_SIZE]) {
+  std::ostringstream OS;
+  for (size_t i = 0; i < BUF_SIZE; ++i) {
+    if (i > 0)
+      OS << " ";
+    // Go through unsigned char first: where char is signed, converting a
+    // byte such as 0xff straight to unsigned sign-extends it and prints
+    // 4294967295 instead of 255.
+    OS << static_cast<unsigned>(static_cast<unsigned char>(Buf[i]));
+  }
+  return OS.str();
+}
+// Checks that each caller passes its arguments the same way whether it
+// was compiled normally or with the Subzero_ prefix.  For every
+// caller/callee pair in TEST_FUNC_TABLE and every argument number, both
+// versions of the caller invoke the same callee, which copies the
+// selected argument into Buf; the two captured buffers must match byte
+// for byte.  Results are accumulated into TotalTests, Passes and Failures.
+void testCaller(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  static struct {
+    const char *CallerName, *CalleeName;
+    size_t Args;
+    void (*Caller)(void);
+    void (*Subzero_Caller)(void);
+    CalleePtrTy Callee;
+  } Funcs[] = {
+#define X(caller, callee, argc)                                                \
+  {                                                                            \
+    STR(caller), STR(callee), argc, &caller, &Subzero_::caller,                \
+        reinterpret_cast<CalleePtrTy>(&callee),                                \
+  }                                                                            \
+  ,
+    TEST_FUNC_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    char BufLlc[BUF_SIZE], BufSz[BUF_SIZE];
+    Callee = Subzero_Callee = Funcs[f].Callee;
+    // Argument numbers are 1-based: the callees switch on ArgNum with
+    // "case 1" through "case argc" (see HANDLE_ARG).  Starting at 0 would
+    // compare two untouched buffers and never exercise the last argument.
+    for (size_t i = 1; i <= Funcs[f].Args; ++i) {
+      // Fill both buffers with a known pattern so that bytes the callee
+      // does not write still compare equal.
+      memset(BufLlc, 0xff, sizeof(BufLlc));
+      memset(BufSz, 0xff, sizeof(BufSz));
+      ArgNum = Subzero_ArgNum = i;
+      Buf = BufLlc;
+      Funcs[f].Caller();
+      Buf = BufSz;
+      Funcs[f].Subzero_Caller();
+      ++TotalTests;
+      if (!memcmp(BufLlc, BufSz, sizeof(BufLlc))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "testCaller(Caller=" << Funcs[f].CallerName
+                  << ", Callee=" << Funcs[f].CalleeName << ", ArgNum=" << ArgNum
+                  << ")\nsz =" << bufAsString(BufSz)
+                  << "\nllc=" << bufAsString(BufLlc) << "\n";
+      }
+    }
+  }
+}
+// Checks that each callee receives its arguments the same way whether it
+// was compiled normally or with the Subzero_ prefix.  For every
+// caller/callee pair in TEST_FUNC_TABLE and every argument number, the
+// same caller is run twice, once against each version of the callee.  The
+// unprefixed callee records the selected argument through Buf and the
+// prefixed one through Subzero_Buf; the two buffers must match byte for
+// byte.  Results are accumulated into TotalTests, Passes and Failures.
+void testCallee(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  static struct {
+    const char *CallerName, *CalleeName;
+    size_t Args;
+    void (*Caller)(void);
+    CalleePtrTy Callee, Subzero_Callee;
+  } Funcs[] = {
+#define X(caller, callee, argc)                                                \
+  {                                                                            \
+    STR(caller), STR(callee), argc, &caller,                                   \
+        reinterpret_cast<CalleePtrTy>(&callee),                                \
+        reinterpret_cast<CalleePtrTy>(&Subzero_::callee)                       \
+  }                                                                            \
+  ,
+    TEST_FUNC_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    char BufLlc[BUF_SIZE], BufSz[BUF_SIZE];
+    Buf = BufLlc;
+    Subzero_Buf = BufSz;
+    // Argument numbers are 1-based: the callees switch on ArgNum with
+    // "case 1" through "case argc" (see HANDLE_ARG).  Starting at 0 would
+    // compare two untouched buffers and never exercise the last argument.
+    for (size_t i = 1; i <= Funcs[f].Args; ++i) {
+      // Fill both buffers with a known pattern so that bytes the callee
+      // does not write still compare equal.
+      memset(BufLlc, 0xff, sizeof(BufLlc));
+      memset(BufSz, 0xff, sizeof(BufSz));
+      ArgNum = Subzero_ArgNum = i;
+      Callee = Funcs[f].Callee;
+      Funcs[f].Caller();
+      Callee = Funcs[f].Subzero_Callee;
+      Funcs[f].Caller();
+      ++TotalTests;
+      if (!memcmp(BufLlc, BufSz, sizeof(BufLlc))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "testCallee(Caller=" << Funcs[f].CallerName
+                  << ", Callee=" << Funcs[f].CalleeName << ", ArgNum=" << ArgNum
+                  << ")\nsz =" << bufAsString(BufSz)
+                  << "\nllc=" << bufAsString(BufLlc) << "\n";
+      }
+    }
+  }
+}
+// Runs the caller and callee cross tests, prints a one-line summary, and
+// returns the number of failures as the exit status.
+int main(int argc, char *argv[]) {
+  size_t Total = 0, Passed = 0, Failed = 0;
+  testCaller(Total, Passed, Failed);
+  testCallee(Total, Passed, Failed);
+  std::cout << "TotalTests=" << Total;
+  std::cout << " Passes=" << Passed;
+  std::cout << " Failures=" << Failed << "\n";
+  return Failed;
+}
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -92,6 +92,9 @@ OperandX8632Mem::OperandX8632Mem(Cfg *Func, Type Ty, Variable *Base,
  }
 }
+InstX8632AdjustStack::InstX8632AdjustStack(Cfg *Func, SizeT Amount)
+    : InstX8632(Func, InstX8632::Adjuststack, 0, NULL), Amount(Amount) {} // No sources, no Dest.
 InstX8632Mul::InstX8632Mul(Cfg *Func, Variable *Dest, Variable *Source1,
                           Operand *Source2)
    : InstX8632(Func, InstX8632::Mul, 2, Dest) {
@@ -226,6 +229,12 @@ InstX8632Movp::InstX8632Movp(Cfg *Func, Variable *Dest, Operand *Source)
  addSource(Source);
 }
+InstX8632StoreP::InstX8632StoreP(Cfg *Func, Operand *Value, OperandX8632 *Mem)
+    : InstX8632(Func, InstX8632::StoreP, 2, NULL) { // Two sources, no Dest.
+  addSource(Value); // Source 0: the value being stored.
+  addSource(Mem);   // Source 1: the memory operand written to.
+}
 InstX8632StoreQ::InstX8632StoreQ(Cfg *Func, Operand *Value, OperandX8632 *Mem)
    : InstX8632(Func, InstX8632::StoreQ, 2, NULL) {
  addSource(Value);
@@ -933,6 +942,24 @@ void InstX8632Store::dump(const Cfg *Func) const {
  getSrc(0)->dump(Func);
 }
+// Emits the store as "movups <source 1>, <source 0>", i.e. destination
+// memory operand first and stored value second.
+void InstX8632StoreP::emit(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Out << "\tmovups\t";
+  // Source 1 is the memory operand, source 0 the value.
+  getSrc(1)->emit(Func);
+  Out << ", ";
+  getSrc(0)->emit(Func);
+  Out << "\n";
+}
+// Dumps the instruction as "storep.<type> <source 1>, <source 0>", where
+// <type> is the type of the stored value.
+void InstX8632StoreP::dump(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrDump();
+  Out << "storep.";
+  Out << getSrc(0)->getType();
+  Out << " ";
+  getSrc(1)->dump(Func);
+  Out << ", ";
+  getSrc(0)->dump(Func);
+}
 void InstX8632StoreQ::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 2);
@@ -1236,6 +1263,17 @@ void InstX8632Pop::dump(const Cfg *Func) const {
  Str << " = pop." << getDest()->getType() << " ";
 }
+// Emits "sub esp, <Amount>" and then reports the same amount to the
+// target through updateStackAdjustment().
+void InstX8632AdjustStack::emit(const Cfg *Func) const {
+  Func->getContext()->getStrEmit() << "\tsub\tesp, " << Amount << "\n";
+  Func->getTarget()->updateStackAdjustment(Amount);
+}
+// Dumps the instruction as "esp = sub.i32 esp, <Amount>".
+void InstX8632AdjustStack::dump(const Cfg *Func) const {
+  Func->getContext()->getStrDump() << "esp = sub.i32 esp, " << Amount;
+}
 void InstX8632Push::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 1);

--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -137,6 +137,7 @@ public:
    Add,
    Addps,
    Addss,
+    Adjuststack,
    And,
    Blendvps,
    Br,
@@ -204,6 +205,7 @@ public:
    Shufps,
    Sqrtss,
    Store,
+    StoreP,
    StoreQ,
    Sub,
    Subps,
@@ -340,6 +342,26 @@ private:
  InstX8632Label *Label; // Intra-block branch target
 };
+// AdjustStack instruction - subtracts esp by the given amount and
+// updates the stack offset during code emission.  Amount is a byte count
+// fixed at construction time.
+class InstX8632AdjustStack : public InstX8632 {
+public:
+  // Allocates the instruction from Func's allocator and constructs it.
+  static InstX8632AdjustStack *create(Cfg *Func, SizeT Amount) {
+    return new (Func->allocate<InstX8632AdjustStack>())
+        InstX8632AdjustStack(Func, Amount);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Adjuststack); }
+private:
+  InstX8632AdjustStack(Cfg *Func, SizeT Amount);
+  InstX8632AdjustStack(const InstX8632AdjustStack &) LLVM_DELETED_FUNCTION;
+  InstX8632AdjustStack &operator=(const InstX8632AdjustStack &)
+      LLVM_DELETED_FUNCTION;
+  // Private destructor, matching the other InstX8632 subclasses in this
+  // header (e.g. InstX8632StoreP).
+  virtual ~InstX8632AdjustStack() {}
+  SizeT Amount;
+};
 // Call instruction.  Arguments should have already been pushed.
 class InstX8632Call : public InstX8632 {
 public:
@@ -960,6 +982,23 @@ private:
  virtual ~InstX8632Movp() {}
 };
+class InstX8632StoreP : public InstX8632 { // Store emitted as movups.
+public:
+  static InstX8632StoreP *create(Cfg *Func, Operand *Value, OperandX8632 *Mem) { // Value is stored to Mem.
+    return new (Func->allocate<InstX8632StoreP>())
+        InstX8632StoreP(Func, Value, Mem);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, StoreP); }
+private: // Construction goes through create(); copying is disabled.
+  InstX8632StoreP(Cfg *Func, Operand *Value, OperandX8632 *Mem);
+  InstX8632StoreP(const InstX8632StoreP &) LLVM_DELETED_FUNCTION;
+  InstX8632StoreP &operator=(const InstX8632StoreP &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632StoreP() {}
+};
 // This is essentially a "movq" instruction with an OperandX8632Mem
 // operand instead of Variable as the destination.  It's important
 // for liveness that there is no Dest operand.

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -121,9 +121,21 @@ Type getInVectorElementType(Type Ty) {
 }
 // The maximum number of arguments to pass in XMM registers
-const unsigned X86_MAX_XMM_ARGS = 4;
+const uint32_t X86_MAX_XMM_ARGS = 4;
 // The number of bits in a byte
-const unsigned X86_CHAR_BIT = 8;
+const uint32_t X86_CHAR_BIT = 8;
+// Stack alignment
+const uint32_t X86_STACK_ALIGNMENT_BYTES = 16;
+// Size of the return address on the stack
+const uint32_t X86_RET_IP_SIZE_BYTES = 4;
+// Rounds Value, a size in bytes, up to the nearest multiple of
+// X86_STACK_ALIGNMENT_BYTES.  A Value that is already a multiple is
+// returned unchanged.
+uint32_t applyStackAlignment(uint32_t Value) {
+  const uint32_t Mask = X86_STACK_ALIGNMENT_BYTES - 1;
+  // The mask arithmetic below is only valid for power-of-2 alignments.
+  assert((X86_STACK_ALIGNMENT_BYTES & Mask) == 0);
+  return (Value + Mask) & ~Mask;
+}
 // Instruction set options
 namespace cl = ::llvm::cl;
@@ -248,8 +260,8 @@ void __attribute__((unused)) xMacroIntegrityCheck() {
 TargetX8632::TargetX8632(Cfg *Func)
    : TargetLowering(Func), InstructionSet(CLInstructionSet),
-      IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0),
+      IsEbpBasedFrame(false), NeedsStackAlignment(false), FrameSizeLocals(0),
-      NextLabelNumber(0), ComputedLiveRanges(false),
+      LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false),
      PhysicalRegisters(VarList(Reg_NUM)) {
  // TODO: Don't initialize IntegerRegisters and friends every time.
  // Instead, initialize in some sort of static initializer for the
@@ -543,6 +555,9 @@ void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
    finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
    return;
  }
+  if (isVectorType(Ty)) {
+    InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
+  }
  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
  if (Arg->hasReg()) {
@@ -570,7 +585,6 @@ void TargetX8632::addProlog(CfgNode *Node) {
  // or B.
  const bool SimpleCoalescing = true;
  size_t InArgsSizeBytes = 0;
-  size_t RetIpSizeBytes = 4;
  size_t PreservedRegsSizeBytes = 0;
  LocalsSizeBytes = 0;
  Context.init(Node);
@@ -657,6 +671,13 @@ void TargetX8632::addProlog(CfgNode *Node) {
    _mov(ebp, esp);
  }
+  if (NeedsStackAlignment) {
+    uint32_t StackSize = applyStackAlignment(
+        X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes + LocalsSizeBytes);
+    LocalsSizeBytes =
+        StackSize - X86_RET_IP_SIZE_BYTES - PreservedRegsSizeBytes;
+  }
  // Generate "sub esp, LocalsSizeBytes"
  if (LocalsSizeBytes)
    _sub(getPhysicalRegister(Reg_esp),
@@ -668,7 +689,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
  // for those that were register-allocated.  Args are pushed right to
  // left, so Arg[0] is closest to the stack/frame pointer.
  Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
-  size_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes;
+  size_t BasicFrameOffset = PreservedRegsSizeBytes + X86_RET_IP_SIZE_BYTES;
  if (!IsEbpBasedFrame)
    BasicFrameOffset += LocalsSizeBytes;
@@ -959,12 +980,42 @@ llvm::SmallBitVector TargetX8632::getRegisterSet(RegSetMask Include,
 void TargetX8632::lowerAlloca(const InstAlloca *Inst) {
  IsEbpBasedFrame = true;
-  // TODO(sehr,stichnot): align allocated memory, keep stack aligned, minimize
+  // Conservatively require the stack to be aligned.  Some stack
-  // the number of adjustments of esp, etc.
+  // adjustment operations implemented below assume that the stack is
+  // aligned before the alloca.  All the alloca code ensures that the
+  // stack alignment is preserved after the alloca.  The stack alignment
+  // restriction can be relaxed in some cases.
+  NeedsStackAlignment = true;
+  // TODO(sehr,stichnot): minimize the number of adjustments of esp, etc.
  Variable *esp = getPhysicalRegister(Reg_esp);
  Operand *TotalSize = legalize(Inst->getSizeInBytes());
  Variable *Dest = Inst->getDest();
-  _sub(esp, TotalSize);
+  uint32_t AlignmentParam = Inst->getAlignInBytes();
+  // LLVM enforces power of 2 alignment.
+  assert((AlignmentParam & (AlignmentParam - 1)) == 0);
+  assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0);
+  uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES);
+  if (Alignment > X86_STACK_ALIGNMENT_BYTES) {
+    _and(esp, Ctx->getConstantInt(IceType_i32, -Alignment));
+  }
+  if (ConstantInteger *ConstantTotalSize =
+          llvm::dyn_cast<ConstantInteger>(TotalSize)) {
+    uint32_t Value = ConstantTotalSize->getValue();
+    // Round Value up to the next highest multiple of the alignment.
+    Value = (Value + Alignment - 1) & -Alignment;
+    _sub(esp, Ctx->getConstantInt(IceType_i32, Value));
+  } else {
+    // Non-constant sizes need to be adjusted to the next highest
+    // multiple of the required alignment at runtime.
+    Variable *T = makeReg(IceType_i32);
+    _mov(T, TotalSize);
+    _add(T, Ctx->getConstantInt(IceType_i32, Alignment - 1));
+    _and(T, Ctx->getConstantInt(IceType_i32, -Alignment));
+    _sub(esp, T);
+  }
  _mov(Dest, esp);
 }
@@ -1544,51 +1595,78 @@ void TargetX8632::lowerBr(const InstBr *Inst) {
 }
 void TargetX8632::lowerCall(const InstCall *Instr) {
+  // x86-32 calling convention:
+  //
+  // * At the point before the call, the stack must be aligned to 16
+  // bytes.
+  //
+  // * The first four arguments of vector type, regardless of their
+  // position relative to the other arguments in the argument list, are
+  // placed in registers xmm0 - xmm3.
+  //
+  // * Other arguments are pushed onto the stack in right-to-left order,
+  // such that the left-most argument ends up on the top of the stack at
+  // the lowest memory address.
+  //
+  // * Stack arguments of vector type are aligned to start at the next
+  // highest multiple of 16 bytes.  Other stack arguments are aligned to
+  // 4 bytes.
+  //
+  // This intends to match the section "IA-32 Function Calling
+  // Convention" of the document "OS X ABI Function Call Guide" by
+  // Apple.
+  NeedsStackAlignment = true;
+  OperandList XmmArgs;
+  OperandList StackArgs, StackArgLocations;
+  uint32_t ParameterAreaSizeBytes = 0;
  // Classify each argument operand according to the location where the
  // argument is passed.
-  OperandList XmmArgs;
-  OperandList StackArgs;
  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
    Operand *Arg = Instr->getArg(i);
-    if (isVectorType(Arg->getType()) && XmmArgs.size() < X86_MAX_XMM_ARGS) {
+    Type Ty = Arg->getType();
+    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
+    assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_i64 ||
+           Ty == IceType_f64 || isVectorType(Ty));
+    if (isVectorType(Ty) && XmmArgs.size() < X86_MAX_XMM_ARGS) {
      XmmArgs.push_back(Arg);
    } else {
      StackArgs.push_back(Arg);
+      if (isVectorType(Arg->getType())) {
+        ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
+      }
+      Variable *esp = Func->getTarget()->getPhysicalRegister(Reg_esp);
+      Constant *Loc = Ctx->getConstantInt(IceType_i32, ParameterAreaSizeBytes);
+      StackArgLocations.push_back(OperandX8632Mem::create(Func, Ty, esp, Loc));
+      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
    }
  }
-  // For stack arguments, generate a sequence of push instructions,
-  // pushing right to left, keeping track of stack offsets in case a
+  // Adjust the parameter area so that the stack is aligned.  It is
-  // push involves a stack operand and we are using an esp-based frame.
+  // assumed that the stack is already aligned at the start of the
-  uint32_t StackOffset = 0;
+  // calling sequence.
-  // TODO: Consolidate the stack adjustment for function calls by
+  ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
-  // reserving enough space for the arguments only once.
+  // Subtract the appropriate amount for the argument area.  This also
+  // takes care of setting the stack adjustment during emission.
  //
  // TODO: If for some reason the call instruction gets dead-code
  // eliminated after lowering, we would need to ensure that the
-  // pre-call push instructions and the post-call esp adjustment get
+  // pre-call and the post-call esp adjustment get eliminated as well.
-  // eliminated as well.
+  if (ParameterAreaSizeBytes) {
-  for (OperandList::reverse_iterator I = StackArgs.rbegin(),
+    _adjust_stack(ParameterAreaSizeBytes);
-           E = StackArgs.rend(); I != E; ++I) {
-    Operand *Arg = legalize(*I);
-    if (Arg->getType() == IceType_i64) {
-      _push(hiOperand(Arg));
-      _push(loOperand(Arg));
-    } else if (Arg->getType() == IceType_f64 || isVectorType(Arg->getType())) {
-      // If the Arg turns out to be a memory operand, more than one push
-      // instruction is required.  This ends up being somewhat clumsy in
-      // the current IR, so we use a workaround.  Force the operand into
-      // a (xmm) register, and then push the register.  An xmm register
-      // push is actually not possible in x86, but the Push instruction
-      // emitter handles this by decrementing the stack pointer and
-      // directly writing the xmm register value.
-      _push(legalize(Arg, Legal_Reg));
-    } else {
-      // Otherwise PNaCl requires parameter types to be at least 32-bits.
-      assert(Arg->getType() == IceType_f32 || Arg->getType() == IceType_i32);
-      _push(Arg);
  }
-    StackOffset += typeWidthInBytesOnStack(Arg->getType());
+  // Copy arguments that are passed on the stack to the appropriate
+  // stack locations.
+  for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
+    lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
+    // TODO: Consider calling postLower() here to reduce the register
+    // pressure associated with using too many infinite weight
+    // temporaries when lowering the call sequence in -Om1 mode.
  }
  // Copy arguments to be passed in registers to the appropriate
  // registers.
  // TODO: Investigate the impact of lowering arguments passed in
@@ -1652,10 +1730,11 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
  if (ReturnRegHi)
    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
-  // Add the appropriate offset to esp.
+  // Add the appropriate offset to esp.  The call instruction takes care
-  if (StackOffset) {
+  // of resetting the stack offset during emission.
+  if (ParameterAreaSizeBytes) {
    Variable *esp = Func->getTarget()->getPhysicalRegister(Reg_esp);
-    _add(esp, Ctx->getConstantInt(IceType_i32, StackOffset));
+    _add(esp, Ctx->getConstantInt(IceType_i32, ParameterAreaSizeBytes));
  }
  // Insert a register-kill pseudo instruction.
@@ -2134,9 +2213,9 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
    // Use pshufd and movd/movss.
    //
-    // ALIGNHACK: Force vector operands to registers in instructions that
+    // ALIGNHACK: Force vector operands to registers in instructions
-    // require aligned memory operands until support for stack alignment
+    // that require aligned memory operands until support for data
-    // is implemented.
+    // alignment is implemented.
 #define ALIGN_HACK(Vect) legalizeToVar((Vect))
    Operand *SourceVectRM =
        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
@@ -2221,8 +2300,8 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      // ALIGNHACK: Without support for stack alignment, both operands to
+      // ALIGNHACK: Without support for data alignment, both operands to
-      // cmpps need to be forced into registers.  Once support for stack
+      // cmpps need to be forced into registers.  Once support for data
      // alignment is implemented, remove LEGAL_HACK.
 #define LEGAL_HACK(Vect) legalizeToVar((Vect))
      switch (Condition) {
@@ -2362,8 +2441,8 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
    }
    // TODO: ALIGNHACK: Both operands to compare instructions need to be
-    // in registers until stack alignment support is implemented.  Once
+    // in registers until data alignment support is implemented.  Once
-    // there is support for stack alignment, LEGAL_HACK can be removed.
+    // there is support for data alignment, LEGAL_HACK can be removed.
 #define LEGAL_HACK(Vect) legalizeToVar((Vect))
    Variable *T = makeReg(Ty);
    switch (Condition) {
@@ -2583,9 +2662,9 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
    Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]);
    Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]);
-    // ALIGNHACK: Force vector operands to registers in instructions that
+    // ALIGNHACK: Force vector operands to registers in instructions
-    // require aligned memory operands until support for stack alignment
+    // that require aligned memory operands until support for data
-    // is implemented.
+    // alignment is implemented.
 #define ALIGN_HACK(Vect) legalizeToVar((Vect))
    if (Index == 1) {
      SourceVectRM = ALIGN_HACK(SourceVectRM);
@@ -2873,7 +2952,8 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
  }
  case Intrinsics::Memset: {
    // The value operand needs to be extended to a stack slot size
-    // because "push" only works for a specific operand size.
+    // because the PNaCl ABI requires arguments to be at least 32 bits
+    // wide.
    Operand *ValOp = Instr->getArg(1);
    assert(ValOp->getType() == IceType_i8);
    Variable *ValExt = Func->makeVariable(stackSlotType(), Context.getNode());
@@ -3560,9 +3640,9 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
    Variable *T = makeReg(SrcTy);
    Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
    Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
-    // ALIGNHACK: Until stack alignment support is implemented, vector
+    // ALIGNHACK: Until data alignment support is implemented, vector
    // instructions need to have vector operands in registers.  Once
-    // there is support for stack alignment, LEGAL_HACK can be removed.
+    // there is support for data alignment, LEGAL_HACK can be removed.
 #define LEGAL_HACK(Vect) legalizeToVar((Vect))
    if (InstructionSet >= SSE4_1) {
      // TODO(wala): If the condition operand is a constant, use blendps
@@ -3657,13 +3737,16 @@ void TargetX8632::lowerStore(const InstStore *Inst) {
  Operand *Value = Inst->getData();
  Operand *Addr = Inst->getAddr();
  OperandX8632Mem *NewAddr = FormMemoryOperand(Addr, Value->getType());
+  Type Ty = NewAddr->getType();
-  if (NewAddr->getType() == IceType_i64) {
+  if (Ty == IceType_i64) {
    Value = legalize(Value);
    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm, true);
    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm, true);
    _store(ValueHi, llvm::cast<OperandX8632Mem>(hiOperand(NewAddr)));
    _store(ValueLo, llvm::cast<OperandX8632Mem>(loOperand(NewAddr)));
+  } else if (isVectorType(Ty)) {
+    _storep(legalizeToVar(Value), NewAddr);
  } else {
    Value = legalize(Value, Legal_Reg | Legal_Imm, true);
    _store(Value, NewAddr);
@@ -4039,9 +4122,9 @@ void TargetX8632::postLower() {
        llvm::SmallBitVector AvailableTypedRegisters =
            AvailableRegisters & getRegisterSetForType(Var->getType());
        if (!AvailableTypedRegisters.any()) {
-          // This is a hack in case we run out of physical registers
+          // This is a hack in case we run out of physical registers due
-          // due to an excessive number of "push" instructions from
+          // to an excessively long code sequence, as might happen when
-          // lowering a call.
+          // lowering arguments in lowerCall().
          AvailableRegisters = WhiteList;
          AvailableTypedRegisters =
              AvailableRegisters & getRegisterSetForType(Var->getType());

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -188,6 +188,9 @@ protected:
  void _add(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Add::create(Func, Dest, Src0));
  }
+  void _adjust_stack(int32_t Amount) {
+    Context.insert(InstX8632AdjustStack::create(Func, Amount));
+  }
  void _addps(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Addps::create(Func, Dest, Src0));
  }
@@ -412,6 +415,9 @@ protected:
  void _store(Operand *Value, OperandX8632 *Mem) {
    Context.insert(InstX8632Store::create(Func, Value, Mem));
  }
+  void _storep(Operand *Value, OperandX8632 *Mem) {
+    Context.insert(InstX8632StoreP::create(Func, Value, Mem));
+  }
  void _storeq(Operand *Value, OperandX8632 *Mem) {
    Context.insert(InstX8632StoreQ::create(Func, Value, Mem));
  }
@@ -450,6 +456,7 @@ protected:
  const X86InstructionSet InstructionSet;
  bool IsEbpBasedFrame;
+  bool NeedsStackAlignment;
  size_t FrameSizeLocals;
  size_t LocalsSizeBytes;
  llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];

--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -33,40 +33,50 @@ entry:
  ret i32 %add3
 }
 ; CHECK: pass64BitArg:
-; CHECK:      push    123
+; CHECK:      sub     esp
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp+4]
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp]
-; CHECK-NEXT: call    ignore64BitArgNoInline
+; CHECK:      mov     dword ptr [esp+8], 123
-; CHECK:      push
+; CHECK:      mov     dword ptr [esp+16]
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp+12]
-; CHECK-NEXT: push    123
+; CHECK:      call    ignore64BitArgNoInline
-; CHECK-NEXT: push
+; CHECK:      sub     esp
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp+4]
-; CHECK-NEXT: call    ignore64BitArgNoInline
+; CHECK:      mov     dword ptr [esp]
-; CHECK:      push
+; CHECK:      mov     dword ptr [esp+8], 123
-; CHECK-NEXT: push
+; CHECK:      mov     dword ptr [esp+16]
-; CHECK-NEXT: push    123
+; CHECK:      mov     dword ptr [esp+12]
-; CHECK-NEXT: push
+; CHECK:      call    ignore64BitArgNoInline
-; CHECK-NEXT: push
+; CHECK:      sub     esp
-; CHECK-NEXT: call    ignore64BitArgNoInline
+; CHECK:      mov     dword ptr [esp+4]
+; CHECK:      mov     dword ptr [esp]
+; CHECK:      mov     dword ptr [esp+8], 123
+; CHECK:      mov     dword ptr [esp+16]
+; CHECK:      mov     dword ptr [esp+12]
+; CHECK:      call    ignore64BitArgNoInline
 ;
 ; OPTM1: pass64BitArg:
-; OPTM1:      push    123
+; OPTM1:      sub     esp
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp+4]
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp]
-; OPTM1-NEXT: call    ignore64BitArgNoInline
+; OPTM1:      mov     dword ptr [esp+8], 123
-; OPTM1:      push
+; OPTM1:      mov     dword ptr [esp+16]
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp+12]
-; OPTM1-NEXT: push    123
+; OPTM1:      call    ignore64BitArgNoInline
-; OPTM1-NEXT: push
+; OPTM1:      sub     esp
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp+4]
-; OPTM1-NEXT: call    ignore64BitArgNoInline
+; OPTM1:      mov     dword ptr [esp]
-; OPTM1:      push
+; OPTM1:      mov     dword ptr [esp+8], 123
-; OPTM1-NEXT: push
+; OPTM1:      mov     dword ptr [esp+16]
-; OPTM1-NEXT: push    123
+; OPTM1:      mov     dword ptr [esp+12]
-; OPTM1-NEXT: push
+; OPTM1:      call    ignore64BitArgNoInline
-; OPTM1-NEXT: push
+; OPTM1:      sub     esp
-; OPTM1-NEXT: call    ignore64BitArgNoInline
+; OPTM1:      mov     dword ptr [esp+4]
+; OPTM1:      mov     dword ptr [esp]
+; OPTM1:      mov     dword ptr [esp+8], 123
+; OPTM1:      mov     dword ptr [esp+16]
+; OPTM1:      mov     dword ptr [esp+12]
+; OPTM1:      call    ignore64BitArgNoInline
 declare i32 @ignore64BitArgNoInline(i64, i32, i64)
@@ -76,19 +86,21 @@ entry:
  ret i32 %call
 }
 ; CHECK: pass64BitConstArg:
-; CHECK:      push    3735928559
+; CHECK:      sub     esp
-; CHECK-NEXT: push    305419896
+; CHECK:      mov     dword ptr [esp+4]
-; CHECK-NEXT: push    123
+; CHECK-NEXT: mov     dword ptr [esp]
-; CHECK-NEXT: push    ecx
+; CHECK-NEXT: mov     dword ptr [esp+8], 123
-; CHECK-NEXT: push    eax
+; CHECK-NEXT: mov     dword ptr [esp+16], 3735928559
+; CHECK-NEXT: mov     dword ptr [esp+12], 305419896
 ; CHECK-NEXT: call    ignore64BitArgNoInline
 ;
 ; OPTM1: pass64BitConstArg:
-; OPTM1:      push    3735928559
+; OPTM1:      sub     esp
-; OPTM1-NEXT: push    305419896
+; OPTM1:      mov     dword ptr [esp+4]
-; OPTM1-NEXT: push    123
+; OPTM1-NEXT: mov     dword ptr [esp]
-; OPTM1-NEXT: push    dword ptr [
+; OPTM1-NEXT: mov     dword ptr [esp+8], 123
-; OPTM1-NEXT: push    dword ptr [
+; OPTM1-NEXT: mov     dword ptr [esp+16], 3735928559
+; OPTM1-NEXT: mov     dword ptr [esp+12], 305419896
 ; OPTM1-NEXT: call    ignore64BitArgNoInline
 define internal i64 @return64BitArg(i64 %a) {
@@ -240,14 +252,14 @@ entry:
  ret i64 %div
 }
 ; CHECK-LABEL: div64BitSignedConst:
-; CHECK: push 2874
+; CHECK: mov     dword ptr [esp+12], 2874
-; CHECK: push 1942892530
+; CHECK: mov     dword ptr [esp+8],  1942892530
 ; CHECK: call    __divdi3
 ; CHECK: ret
 ;
 ; OPTM1-LABEL: div64BitSignedConst:
-; OPTM1: push 2874
+; OPTM1: mov     dword ptr [esp+12], 2874
-; OPTM1: push 1942892530
+; OPTM1: mov     dword ptr [esp+8],  1942892530
 ; OPTM1: call    __divdi3
 ; OPTM1: ret

--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
-; This is a basic test of the alloca instruction - one test for alloca
+; This is a basic test of the alloca instruction.
-; of a fixed size, and one test for variable size.
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
-; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -O2 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
 ; RUN: %llvm2ice -Om1 --verbose none %s \
@@ -12,45 +11,95 @@
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
 ; RUN:                           | FileCheck --check-prefix=DUMP %s
-define void @fixed_400(i32 %n) {
+define void @fixed_416_align_16(i32 %n) {
 entry:
-  %array = alloca i8, i32 400, align 16
+  %array = alloca i8, i32 416, align 16
  %__2 = ptrtoint i8* %array to i32
  call void @f1(i32 %__2)
  ret void
 }
-; CHECK: fixed_400:
+; CHECK-LABEL: fixed_416_align_16:
-; CHECK:      sub     esp, 400
+; CHECK:      sub     esp, 416
-; CHECK-NEXT: mov     eax, esp
+; CHECK:      sub     esp, 16
-; CHECK-NEXT: push    eax
+; CHECK:      mov     dword ptr [esp], eax
-; CHECK-NEXT: call    f1
+; CHECK:      call    f1
-;
-; OPTM1: fixed_400:
+define void @fixed_416_align_32(i32 %n) {
-; OPTM1:      sub     esp, 400
+entry:
-; OPTM1-NEXT: mov     {{.*}}, esp
+  %array = alloca i8, i32 400, align 32
-; OPTM1:      push
+  %__2 = ptrtoint i8* %array to i32
-; OPTM1-NEXT: call    f1
+  call void @f1(i32 %__2)
+  ret void
+}
+; CHECK-LABEL: fixed_416_align_32:
+; CHECK:      and     esp, 4294967264
+; CHECK:      sub     esp, 416
+; CHECK:      sub     esp, 16
+; CHECK:      mov     dword ptr [esp], eax
+; CHECK:      call    f1
+define void @fixed_351_align_16(i32 %n) {
+entry:
+  %array = alloca i8, i32 351, align 16
+  %__2 = ptrtoint i8* %array to i32
+  call void @f1(i32 %__2)
+  ret void
+}
+; CHECK-LABEL: fixed_351_align_16:
+; CHECK:      sub     esp, 352
+; CHECK:      sub     esp, 16
+; CHECK:      mov     dword ptr [esp], eax
+; CHECK:      call    f1
+define void @fixed_351_align_32(i32 %n) {
+entry:
+  %array = alloca i8, i32 351, align 32
+  %__2 = ptrtoint i8* %array to i32
+  call void @f1(i32 %__2)
+  ret void
+}
+; CHECK-LABEL: fixed_351_align_32:
+; CHECK:      and     esp, 4294967264
+; CHECK:      sub     esp, 352
+; CHECK:      sub     esp, 16
+; CHECK:      mov     dword ptr [esp], eax
+; CHECK:      call    f1
 declare void @f1(i32)
-define void @variable_n(i32 %n) {
+define void @variable_n_align_16(i32 %n) {
 entry:
  %array = alloca i8, i32 %n, align 16
  %__2 = ptrtoint i8* %array to i32
  call void @f2(i32 %__2)
  ret void
 }
-; CHECK: variable_n:
+; CHECK-LABEL: variable_n_align_16:
 ; CHECK:      mov     eax, dword ptr [ebp+8]
-; CHECK-NEXT: sub     esp, eax
+; CHECK:      add     eax, 15
-; CHECK-NEXT: mov     eax, esp
+; CHECK:      and     eax, 4294967280
-; CHECK-NEXT: push    eax
+; CHECK:      sub     esp, eax
-; CHECK-NEXT: call    f2
+; CHECK:      sub     esp, 16
-;
+; CHECK:      mov     dword ptr [esp], eax
-; OPTM1: variable_n:
+; CHECK:      call    f2
-; OPTM1:      mov     {{.*}}, esp
-; OPTM1:      push
+define void @variable_n_align_32(i32 %n) {
-; OPTM1-NEXT: call    f2
+entry:
+  %array = alloca i8, i32 %n, align 32
+  %__2 = ptrtoint i8* %array to i32
+  call void @f2(i32 %__2)
+  ret void
+}
+; In -O2, the order of the CHECK-DAG lines in the output is switched.
+; CHECK-LABEL: variable_n_align_32:
+; CHECK-DAG:  and     esp, 4294967264
+; CHECK-DAG:  mov     eax, dword ptr [ebp+8]
+; CHECK:      add     eax, 31
+; CHECK:      and     eax, 4294967264
+; CHECK:      sub     esp, eax
+; CHECK:      sub     esp, 16
+; CHECK:      mov     dword ptr [esp], eax
+; CHECK:      call    f2
 declare void @f2(i32)

--- a/tests_lit/llvm2ice_tests/ebp_args.ll
+++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -22,10 +22,11 @@ entry:
 ; lowering code changes.
 ; CHECK: memcpy_helper:
+; CHECK:  push     ebx
 ; CHECK:  push     ebp
 ; CHECK:  mov      ebp, esp
 ; CHECK:  sub      esp, 20
-; CHECK: mov      eax, dword ptr [ebp+12]
+; CHECK:  mov      eax, dword ptr [ebp+16]
 ; CHECK:  mov      dword ptr [ebp-4], eax
 ; CHECK:  sub      esp, 128
 ; CHECK:  mov      dword ptr [ebp-8], esp
@@ -33,7 +34,11 @@ entry:
 ; CHECK:  mov      dword ptr [ebp-12], eax
 ; CHECK:  movzx    eax, byte ptr [ebp-4]
 ; CHECK:  mov      dword ptr [ebp-16], eax
-; CHECK: push     dword ptr [ebp-16]
+; CHECK:  sub      esp, 16
-; CHECK: push     dword ptr [ebp-12]
+; CHECK:  mov      ecx, dword ptr [ebp+12]
-; CHECK: push     dword ptr [ebp+8]
+; CHECK:  mov      dword ptr [esp], ecx
+; CHECK:  mov      edx, dword ptr [ebp-12]
+; CHECK:  mov      dword ptr [esp+4], edx
+; CHECK:  mov      ebx, dword ptr [ebp-16]
+; CHECK:  mov      dword ptr [esp+8], ebx
 ; CHECK:  call     memcpy_helper2
--- a/tests_lit/llvm2ice_tests/fp.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/fp.pnacl.ll
@@ -45,11 +45,11 @@ entry:
  ret i32 %add3
 }
 ; CHECK-LABEL: passFpArgs
-; CHECK: push 123
+; CHECK: mov dword ptr [esp+4], 123
 ; CHECK: call ignoreFpArgsNoInline
-; CHECK: push 123
+; CHECK: mov dword ptr [esp+4], 123
 ; CHECK: call ignoreFpArgsNoInline
-; CHECK: push 123
+; CHECK: mov dword ptr [esp+4], 123
 ; CHECK: call ignoreFpArgsNoInline
 declare i32 @ignoreFpArgsNoInline(float, i32, double)
@@ -60,7 +60,7 @@ entry:
  ret i32 %call
 }
 ; CHECK-LABEL: passFpConstArg
-; CHECK: push 123
+; CHECK: mov dword ptr [esp+4], 123
 ; CHECK: call ignoreFpArgsNoInline
 define internal i32 @passFp32ConstArg(float %a) {
@@ -69,8 +69,8 @@ entry:
  ret i32 %call
 }
 ; CHECK-LABEL: passFp32ConstArg
-; CHECK: push dword
+; CHECK: mov dword ptr [esp+4], 123
-; CHECK: push 123
+; CHECK: movss dword ptr [esp+8]
 ; CHECK: call ignoreFp32ArgsNoInline
 declare i32 @ignoreFp32ArgsNoInline(float, i32, float)
@@ -415,8 +415,8 @@ entry:
  ret double %conv
 }
 ; CHECK-LABEL: unsigned64ToDouble
-; CHECK: push 2874
+; CHECK: mov dword ptr [esp+4], 2874
-; CHECK: push 1942892530
+; CHECK: mov dword ptr [esp], 1942892530
 ; CHECK: call cvtui64tod
 ; CHECK: fstp

--- a/tests_lit/llvm2ice_tests/undef.ll
+++ b/tests_lit/llvm2ice_tests/undef.ll
@@ -37,8 +37,7 @@ define float @undef_float() {
 entry:
  ret float undef
 ; CHECK-LABEL: undef_float:
-; CHECK-NOT: sub esp
+; CHECK: [L$float$
-; CHECK: fld
 }
 define <4 x i1> @undef_v4i1() {

--- a/tests_lit/llvm2ice_tests/vector-arg.ll
+++ b/tests_lit/llvm2ice_tests/vector-arg.ll
 ; This file checks that Subzero generates code in accordance with the
 ; calling convention for vectors.
-; NOTE: CHECK / OPTM1 lines containing the following strings may be
-; subject to change:
-;
-; * movups: The movups instruction may be changed to movaps when the
-; load / store operation is 16 byte aligned.
-;
-; * stack offsets: These may need to be changed if stack alignment
-; support is implemented.
-;
-; * stack adjustment operations
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
 ; RUN: %llvm2ice -O2 --verbose none %s \
@@ -150,7 +139,7 @@ define <4 x float> @test_returning_interspersed_arg4(i32 %i32arg0, double %doubl
 entry:
  ret <4 x float> %arg4
 ; CHECK-LABEL: test_returning_interspersed_arg4:
-; CHECK: movups xmm0, xmmword ptr [esp+44]
+; CHECK: movups xmm0, xmmword ptr [esp+52]
 ; CHECK: ret
 ; OPTM1-LABEL: test_returning_interspersed_arg4:
@@ -172,33 +161,69 @@ entry:
  call void @VectorArgs(<4 x float> %arg9, <4 x float> %arg8, <4 x float> %arg7, <4 x float> %arg6, <4 x float> %arg5, <4 x float> %arg4)
  ret void
 ; CHECK-LABEL: test_passing_vectors:
-; CHECK: movups  [[ARG6:.*]], xmmword ptr [esp+4]
+; CHECK: sub esp, 32
-; CHECK: sub esp, 16
+; CHECK: movups  [[ARG5:.*]], xmmword ptr [esp+64]
-; CHECK-NEXT: movups xmmword ptr [esp], [[ARG6]]
+; CHECK: movups  xmmword ptr [esp], [[ARG5]]
-; CHECK: movups  [[ARG5:.*]], xmmword ptr [esp+36]
+; CHECK: movups  [[ARG6:.*]], xmmword ptr [esp+48]
-; CHECK: sub esp, 16
+; CHECK: movups  xmmword ptr [esp+16], [[ARG6]]
-; CHECK-NEXT: movups xmmword ptr [esp], [[ARG5]]
+; CHECK: movups  xmm0, xmmword ptr [esp+128]
-; CHECK: movups  xmm0, xmmword ptr [esp+116]
+; CHECK: movups  xmm1, xmmword ptr [esp+112]
-; CHECK: movups  xmm1, xmmword ptr [esp+100]
+; CHECK: movups  xmm2, xmmword ptr [esp+96]
-; CHECK: movups  xmm2, xmmword ptr [esp+84]
+; CHECK: movups  xmm3, xmmword ptr [esp+80]
-; CHECK: movups  xmm3, xmmword ptr [esp+68]
 ; CHECK: call VectorArgs
 ; CHECK-NEXT: add esp, 32
 ; CHECK: ret
 ; OPTM1-LABEL: test_passing_vectors:
-; OPTM1: movups  [[ARG6:.*]], xmmword ptr {{.*}}
+; OPTM1: sub esp, 32
-; OPTM1: sub esp, 16
-; OPTM1: movups xmmword ptr [esp], [[ARG6]]
 ; OPTM1: movups  [[ARG5:.*]], xmmword ptr {{.*}}
-; OPTM1: sub esp, 16
+; OPTM1: movups  xmmword ptr [esp], [[ARG5]]
-; OPTM1-NEXT: movups xmmword ptr [esp], [[ARG5]]
+; OPTM1: movups  [[ARG6:.*]], xmmword ptr {{.*}}
+; OPTM1: movups  xmmword ptr [esp+16], [[ARG6]]
 ; OPTM1: movups  xmm0, xmmword ptr {{.*}}
 ; OPTM1: movups  xmm1, xmmword ptr {{.*}}
 ; OPTM1: movups  xmm2, xmmword ptr {{.*}}
 ; OPTM1: movups  xmm3, xmmword ptr {{.*}}
 ; OPTM1: call VectorArgs
-; OPTM1: add esp, 32
+; OPTM1-NEXT: add esp, 32
+; OPTM1: ret
+}
+declare void @InterspersedVectorArgs(<4 x float>, i64, <4 x float>, i64, <4 x float>, float, <4 x float>, double, <4 x float>, i32, <4 x float>)
+define void @test_passing_vectors_interspersed(<4 x float> %arg0, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3, <4 x float> %arg4, <4 x float> %arg5, <4 x float> %arg6, <4 x float> %arg7, <4 x float> %arg8, <4 x float> %arg9) {
+entry:
+  ; Kills XMM registers so that no in-arg lowering code interferes
+  ; with the test.
+  call void @killXmmRegisters()
+  call void @InterspersedVectorArgs(<4 x float> %arg9, i64 0, <4 x float> %arg8, i64 1, <4 x float> %arg7, float 2.000000e+00, <4 x float> %arg6, double 3.000000e+00, <4 x float> %arg5, i32 4, <4 x float> %arg4)
+  ret void
+; CHECK-LABEL: test_passing_vectors_interspersed:
+; CHECK: sub esp, 80
+; CHECK: movups  [[ARG9:.*]], xmmword ptr [esp+112]
+; CHECK: movups  xmmword ptr [esp+32], [[ARG9]]
+; CHECK: movups  [[ARG11:.*]], xmmword ptr [esp+96]
+; CHECK: movups  xmmword ptr [esp+64], [[ARG11]]
+; CHECK: movups  xmm0, xmmword ptr [esp+176]
+; CHECK: movups  xmm1, xmmword ptr [esp+160]
+; CHECK: movups  xmm2, xmmword ptr [esp+144]
+; CHECK: movups  xmm3, xmmword ptr [esp+128]
+; CHECK: call InterspersedVectorArgs
+; CHECK-NEXT: add esp, 80
+; CHECK: ret
+; OPTM1-LABEL: test_passing_vectors_interspersed:
+; OPTM1: sub esp, 80
+; OPTM1: movups  [[ARG9:.*]], xmmword ptr {{.*}}
+; OPTM1: movups  xmmword ptr [esp+32], [[ARG9]]
+; OPTM1: movups  [[ARG11:.*]], xmmword ptr {{.*}}
+; OPTM1: movups  xmmword ptr [esp+64], [[ARG11]]
+; OPTM1: movups  xmm0, xmmword ptr {{.*}}
+; OPTM1: movups  xmm1, xmmword ptr {{.*}}
+; OPTM1: movups  xmm2, xmmword ptr {{.*}}
+; OPTM1: movups  xmm3, xmmword ptr {{.*}}
+; OPTM1: call InterspersedVectorArgs
+; OPTM1-NEXT: add esp, 80
 ; OPTM1: ret
 }
@@ -220,8 +245,8 @@ entry:
 ; OPTM1-LABEL: test_receiving_vectors:
 ; OPTM1: call VectorReturn
-; OPTM1: movups [[LOC:.*]], xmm0
+; OPTM1: movups {{.*}}, xmm0
-; OPTM1: movups xmm0, [[LOC]]
+; OPTM1: movups xmm0, {{.*}}
 ; OPTM1: call VectorReturn
 ; OPTM1: ret
 }