1) Move helper creation to separate method, which also computes the maximum…

1) Move helper creation to a separate method, which also computes the maximum outgoing argument size. The computed size is checked against the parameter area size computed during call lowering. 2) Make addProlog reserve the outgoing argument area in the frame and remove the per-call stack adjustments in lowerCall. 3) Remove the AdjustStack instruction and its supporting code (stack-adjustment tracking in TargetLowering, emit-state snapshot/rollback, and the IgnoreStackAdjust flag). BUG= R=jpp@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/1472623002.

1) Move helper creation to separate method, which also computes the maximum…
26217e33 · David Sehr · 83f957a1 · 26217e33 · 26217e33 · 26217e33
Commit 26217e33 authored Nov 26, 2015 by David Sehr
30 changed files
--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -296,11 +296,8 @@ DecodedResult decodeAddress(const Operand *Opnd, IValueT &Value,
    if (!Utils::IsAbsoluteUint(12, Offset))
      return CantDecode;
    int32_t BaseRegNum = Var->getBaseRegNum();
-    if (BaseRegNum == Variable::NoRegister) {
+    if (BaseRegNum == Variable::NoRegister)
      BaseRegNum = TInfo.FrameOrStackReg;
-      if (!TInfo.HasFramePointer)
-        Offset += TInfo.StackAdjustment;
-    }
    Value = decodeImmRegOffset(decodeGPRRegister(BaseRegNum), Offset,
                               OperandARM32Mem::Offset);
    return DecodedAsImmRegOffset;

--- a/src/IceAssemblerARM32.h
+++ b/src/IceAssemblerARM32.h
@@ -82,17 +82,13 @@ public:
    TargetInfo &operator=(const TargetInfo &) = delete;

  public:
-    TargetInfo(bool HasFramePointer, SizeT FrameOrStackReg,
-               int32_t StackAdjustment)
-        : HasFramePointer(HasFramePointer), FrameOrStackReg(FrameOrStackReg),
-          StackAdjustment(StackAdjustment) {}
+    TargetInfo(bool HasFramePointer, SizeT FrameOrStackReg)
+        : HasFramePointer(HasFramePointer), FrameOrStackReg(FrameOrStackReg) {}
    explicit TargetInfo(const TargetLowering *Target)
        : HasFramePointer(Target->hasFramePointer()),
-          FrameOrStackReg(Target->getFrameOrStackReg()),
-          StackAdjustment(Target->getStackAdjustment()) {}
+          FrameOrStackReg(Target->getFrameOrStackReg()) {}
    const bool HasFramePointer;
    const SizeT FrameOrStackReg;
-    const int32_t StackAdjustment;
  };

  explicit AssemblerARM32(bool use_far_branches = false)

--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -1092,9 +1092,8 @@ class BundleEmitHelper {
  BundleEmitHelper &operator=(const BundleEmitHelper &) = delete;

 public:
-  BundleEmitHelper(Assembler *Asm, TargetLowering *Target,
-                   const InstList &Insts)
-      : Asm(Asm), Target(Target), End(Insts.end()), BundleLockStart(End),
+  BundleEmitHelper(Assembler *Asm, const InstList &Insts)
+      : Asm(Asm), End(Insts.end()), BundleLockStart(End),
        BundleSize(1 << Asm->getBundleAlignLog2Bytes()),
        BundleMaskLo(BundleSize - 1), BundleMaskHi(~BundleMaskLo) {}
  // Check whether we're currently within a bundle_lock region.
@@ -1136,7 +1135,6 @@ public:
    BundleLockStart = I;
    SizeSnapshotPre = Asm->getBufferSize();
    Asm->setPreliminary(true);
-    Target->snapshotEmitState();
    assert(isInBundleLockRegion());
  }
  // Update bookkeeping when the bundle_unlock instruction is processed.
@@ -1176,12 +1174,10 @@ public:
    assert(isInBundleLockRegion());
    Asm->setBufferSize(SizeSnapshotPre);
    Asm->setPreliminary(false);
-    Target->rollbackEmitState();
  }

 private:
  Assembler *const Asm;
-  TargetLowering *const Target;
  // End is a sentinel value such that BundleLockStart==End implies that we are
  // not in a bundle_lock region.
  const InstList::const_iterator End;
@@ -1236,7 +1232,7 @@ void CfgNode::emitIAS(Cfg *Func) const {
  // bindings, label links, and relocation fixups. Instead, the first pass just
  // disables all mutation of that state.

-  BundleEmitHelper Helper(Asm, Func->getTarget(), Insts);
+  BundleEmitHelper Helper(Asm, Insts);
  InstList::const_iterator End = Insts.end();
  // Retrying indicates that we had to roll back to the bundle_lock instruction
  // to apply padding before the bundle_lock sequence.

--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -975,7 +975,6 @@ void InstARM32Call::emit(const Cfg *Func) const {
        << "\t";
    getCallTarget()->emit(Func);
  }
-  Func->getTarget()->resetStackAdjustment();
 }

 void InstARM32Call::emitIAS(const Cfg *Func) const {
@@ -995,7 +994,6 @@ void InstARM32Call::emitIAS(const Cfg *Func) const {
  }
  if (Asm->needsTextFixup())
    return emitUsingTextFixup(Func);
-  Func->getTarget()->resetStackAdjustment();
 }

 void InstARM32Call::dump(const Cfg *Func) const {

--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -99,17 +99,13 @@ MachineTraits<TargetX8632>::X86OperandMem::X86OperandMem(
 }

 namespace {
-static int32_t GetRematerializableOffset(Variable *Var, bool IgnoreStackAdjust,
+static int32_t GetRematerializableOffset(Variable *Var,
                                         const Ice::TargetX8632 *Target) {
-  int32_t Disp = 0;
-  Disp += Var->getStackOffset();
+  int32_t Disp = Var->getStackOffset();
  SizeT RegNum = static_cast<SizeT>(Var->getRegNum());
-  if (RegNum == Target->getStackReg()) {
-    if (!IgnoreStackAdjust)
-      Disp += Target->getStackAdjustment();
-  } else if (RegNum == Target->getFrameReg()) {
+  if (RegNum == Target->getFrameReg()) {
    Disp += Target->getFrameFixedAllocaOffset();
-  } else {
+  } else if (RegNum != Target->getStackReg()) {
    llvm::report_fatal_error("Unexpected rematerializable register type");
  }
  return Disp;
@@ -124,8 +120,7 @@ void MachineTraits<TargetX8632>::X86OperandMem::emit(const Cfg *Func) const {
  // physical register (esp or ebp), and update the Offset.
  int32_t Disp = 0;
  if (getBase() && getBase()->isRematerializable()) {
-    Disp +=
-        GetRematerializableOffset(getBase(), getIgnoreStackAdjust(), Target);
+    Disp += GetRematerializableOffset(getBase(), Target);
  }
  // The index should never be rematerializable.  But if we ever allow it, then
  // we should make sure the rematerialization offset is shifted by the Shift
@@ -184,8 +179,7 @@ void MachineTraits<TargetX8632>::X86OperandMem::dump(const Cfg *Func,
  int32_t Disp = 0;
  const auto *Target = static_cast<const Ice::TargetX8632 *>(Func->getTarget());
  if (getBase() && getBase()->isRematerializable()) {
-    Disp +=
-        GetRematerializableOffset(getBase(), getIgnoreStackAdjust(), Target);
+    Disp += GetRematerializableOffset(getBase(), Target);
  }
  if (getBase()) {
    if (Func)
@@ -251,8 +245,7 @@ MachineTraits<TargetX8632>::X86OperandMem::toAsmAddress(
  int32_t Disp = 0;
  const auto *Target = static_cast<const Ice::TargetX8632 *>(TargetLowering);
  if (getBase() && getBase()->isRematerializable()) {
-    Disp +=
-        GetRematerializableOffset(getBase(), getIgnoreStackAdjust(), Target);
+    Disp += GetRematerializableOffset(getBase(), Target);
  }
  // The index should never be rematerializable.  But if we ever allow it, then
  // we should make sure the rematerialization offset is shifted by the Shift
@@ -295,8 +288,7 @@ MachineTraits<TargetX8632>::Address
 MachineTraits<TargetX8632>::VariableSplit::toAsmAddress(const Cfg *Func) const {
  assert(!Var->hasReg());
  const ::Ice::TargetLowering *Target = Func->getTarget();
-  int32_t Offset =
-      Var->getStackOffset() + Target->getStackAdjustment() + getOffset();
+  int32_t Offset = Var->getStackOffset() + getOffset();
  return X8632::Traits::Address(getEncodedGPR(Target->getFrameOrStackReg()),
                                Offset, AssemblerFixup::NoFixup);
 }
@@ -309,8 +301,7 @@ void MachineTraits<TargetX8632>::VariableSplit::emit(const Cfg *Func) const {
  // The following is copied/adapted from TargetX8632::emitVariable().
  const ::Ice::TargetLowering *Target = Func->getTarget();
  constexpr Type Ty = IceType_i32;
-  int32_t Offset =
-      Var->getStackOffset() + Target->getStackAdjustment() + getOffset();
+  int32_t Offset = Var->getStackOffset() + getOffset();
  if (Offset)
    Str << Offset;
  Str << "(%" << Target->getRegName(Target->getFrameOrStackReg(), Ty) << ")";

--- a/src/IceInstX8664.cpp
+++ b/src/IceInstX8664.cpp
@@ -215,8 +215,7 @@ MachineTraits<TargetX8664>::Address
 MachineTraits<TargetX8664>::VariableSplit::toAsmAddress(const Cfg *Func) const {
  assert(!Var->hasReg());
  const ::Ice::TargetLowering *Target = Func->getTarget();
-  int32_t Offset =
-      Var->getStackOffset() + Target->getStackAdjustment() + getOffset();
+  int32_t Offset = Var->getStackOffset() + getOffset();
  return X8664::Traits::Address(getEncodedGPR(Target->getFrameOrStackReg()),
                                Offset, AssemblerFixup::NoFixup);
 }
@@ -229,8 +228,7 @@ void MachineTraits<TargetX8664>::VariableSplit::emit(const Cfg *Func) const {
  // The following is copied/adapted from TargetX8664::emitVariable().
  const ::Ice::TargetLowering *Target = Func->getTarget();
  constexpr Type Ty = IceType_i32;
-  int32_t Offset =
-      Var->getStackOffset() + Target->getStackAdjustment() + getOffset();
+  int32_t Offset = Var->getStackOffset() + getOffset();
  if (Offset)
    Str << Offset;
  Str << "(%" << Target->getRegName(Target->getFrameOrStackReg(), Ty) << ")";

--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -405,33 +405,6 @@ private:
  InstX86Jmp(Cfg *Func, Operand *Target);
 };

-/// AdjustStack instruction - grows the stack (moves esp down) by the given
-/// amount.  If the amount is negative, it shrinks the stack (moves esp up).
-/// It also updates the target lowering StackAdjustment during code emission.
-template <class Machine>
-class InstX86AdjustStack final : public InstX86Base<Machine> {
-  InstX86AdjustStack() = delete;
-  InstX86AdjustStack(const InstX86AdjustStack &) = delete;
-  InstX86AdjustStack &operator=(const InstX86AdjustStack &) = delete;
-
-public:
-  static InstX86AdjustStack *create(Cfg *Func, int32_t Amount, Variable *Esp) {
-    return new (Func->allocate<InstX86AdjustStack>())
-        InstX86AdjustStack(Func, Amount, Esp);
-  }
-  void emit(const Cfg *Func) const override;
-  void emitIAS(const Cfg *Func) const override;
-  void dump(const Cfg *Func) const override;
-  static bool classof(const Inst *Inst) {
-    return InstX86Base<Machine>::isClassof(Inst,
-                                           InstX86Base<Machine>::Adjuststack);
-  }
-
-private:
-  InstX86AdjustStack(Cfg *Func, int32_t Amount, Variable *Esp);
-  const int32_t Amount;
-};
-
 /// Call instruction. Arguments should have already been pushed.
 template <class Machine> class InstX86Call final : public InstX86Base<Machine> {
  InstX86Call() = delete;
@@ -2791,7 +2764,6 @@ template <class Machine> struct Insts {
  using FakeRMW = InstX86FakeRMW<Machine>;
  using Label = InstX86Label<Machine>;

-  using AdjustStack = InstX86AdjustStack<Machine>;
  using Call = InstX86Call<Machine>;

  using Br = InstX86Br<Machine>;

--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -57,14 +57,6 @@ InstX86FakeRMW<Machine>::InstX86FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
 }

 template <class Machine>
-InstX86AdjustStack<Machine>::InstX86AdjustStack(Cfg *Func, int32_t Amount,
-                                                Variable *Esp)
-    : InstX86Base<Machine>(Func, InstX86Base<Machine>::Adjuststack, 1, Esp),
-      Amount(Amount) {
-  this->addSource(Esp);
-}
-
-template <class Machine>
 InstX86Mul<Machine>::InstX86Mul(Cfg *Func, Variable *Dest, Variable *Source1,
                                Operand *Source2)
    : InstX86Base<Machine>(Func, InstX86Base<Machine>::Mul, 2, Dest) {
@@ -2907,46 +2899,6 @@ template <class Machine> void InstX86Pop<Machine>::dump(const Cfg *Func) const {
 }

 template <class Machine>
-void InstX86AdjustStack<Machine>::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  if (Amount > 0)
-    Str << "\tsubl\t$" << Amount << ", %esp";
-  else
-    Str << "\taddl\t$" << -Amount << ", %esp";
-  auto *Target = InstX86Base<Machine>::getTarget(Func);
-  Target->updateStackAdjustment(Amount);
-}
-
-template <class Machine>
-void InstX86AdjustStack<Machine>::emitIAS(const Cfg *Func) const {
-  typename InstX86Base<Machine>::Traits::Assembler *Asm =
-      Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
-  if (Amount > 0)
-    Asm->sub(IceType_i32,
-             InstX86Base<Machine>::Traits::RegisterSet::Encoded_Reg_esp,
-             Immediate(Amount));
-  else
-    Asm->add(IceType_i32,
-             InstX86Base<Machine>::Traits::RegisterSet::Encoded_Reg_esp,
-             Immediate(-Amount));
-  auto *Target = InstX86Base<Machine>::getTarget(Func);
-  Target->updateStackAdjustment(Amount);
-}
-
-template <class Machine>
-void InstX86AdjustStack<Machine>::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  if (Amount > 0)
-    Str << "esp = sub.i32 esp, " << Amount;
-  else
-    Str << "esp = add.i32 esp, " << -Amount;
-}
-
-template <class Machine>
 void InstX86Push<Machine>::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;

--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -221,9 +221,6 @@ public:
  /// twice" attribute.
  bool callsReturnsTwice() const { return CallsReturnsTwice; }
  void setCallsReturnsTwice(bool RetTwice) { CallsReturnsTwice = RetTwice; }
-  int32_t getStackAdjustment() const { return StackAdjustment; }
-  void updateStackAdjustment(int32_t Offset) { StackAdjustment += Offset; }
-  void resetStackAdjustment() { StackAdjustment = 0; }
  SizeT makeNextLabelNumber() { return NextLabelNumber++; }
  SizeT makeNextJumpTableNumber() { return NextJumpTableNumber++; }
  LoweringContext &getContext() { return Context; }
@@ -251,17 +248,6 @@ public:
                                const llvm::SmallBitVector &ExcludeRegisters,
                                uint64_t Salt) const = 0;

-  /// Save/restore any mutable state for the situation where code emission needs
-  /// multiple passes, such as sandboxing or relaxation. Subclasses may provide
-  /// their own implementation, but should be sure to also call the parent
-  /// class's methods.
-  virtual void snapshotEmitState() {
-    SnapshotStackAdjustment = StackAdjustment;
-  }
-  virtual void rollbackEmitState() {
-    StackAdjustment = SnapshotStackAdjustment;
-  }
-
  /// Get the minimum number of clusters required for a jump table to be
  /// considered.
  virtual SizeT getMinJumpTableSize() const = 0;
@@ -393,10 +379,6 @@ protected:
  GlobalContext *Ctx;
  bool HasComputedFrame = false;
  bool CallsReturnsTwice = false;
-  /// StackAdjustment keeps track of the current stack offset from its natural
-  /// location, e.g. as arguments are pushed for a function call or as
-  /// fixed-size alloca instructions are executed in the entry block.
-  int32_t StackAdjustment = 0;
  SizeT NextLabelNumber = 0;
  SizeT NextJumpTableNumber = 0;
  LoweringContext Context;
@@ -438,9 +420,6 @@ protected:
  const static constexpr char *H_uitofp_i64_f64 = "__Sz_uitofp_i64_f64";
  const static constexpr char *H_urem_i32 = "__umodsi3";
  const static constexpr char *H_urem_i64 = "__umoddi3";
-
-private:
-  int32_t SnapshotStackAdjustment = 0;
 };

 /// TargetDataLowering is used for "lowering" data including initializers for

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -849,13 +849,13 @@ void TargetARM32::addProlog(CfgNode *Node) {
      GlobalsSize + LocalsSlotsPaddingBytes;

  // Adds the out args space to the stack, and align SP if necessary.
-  if (!NeedsStackAlignment) {
-    SpillAreaSizeBytes += MaxOutArgsSizeBytes;
-  } else {
+  if (NeedsStackAlignment) {
    uint32_t StackOffset = PreservedRegsSizeBytes;
    uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
    StackSize = applyStackAlignment(StackSize + MaxOutArgsSizeBytes);
    SpillAreaSizeBytes = StackSize - StackOffset;
+  } else {
+    SpillAreaSizeBytes += MaxOutArgsSizeBytes;
  }

  // Combine fixed alloca with SpillAreaSize.

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -152,8 +152,6 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
          Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
      auto *Mem = Traits::X86OperandMem::create(Func, Ty, esp, Loc);
-      // Stack stores for arguments are fixed to esp.
-      Mem->setIgnoreStackAdjust(true);
      StackArgLocations.push_back(Mem);
      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
    }
@@ -169,16 +167,8 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
  // Adjust the parameter area so that the stack is aligned. It is assumed that
  // the stack is already aligned at the start of the calling sequence.
  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
-
-  // Subtract the appropriate amount for the argument area. This also takes
-  // care of setting the stack adjustment during emission.
-  //
-  // TODO: If for some reason the call instruction gets dead-code eliminated
-  // after lowering, we would need to ensure that the pre-call and the
-  // post-call esp adjustment get eliminated as well.
-  if (ParameterAreaSizeBytes) {
-    _adjust_stack(ParameterAreaSizeBytes);
-  }
+  assert(static_cast<uint32_t>(ParameterAreaSizeBytes) <=
+         maxOutArgsSizeBytes());

  // Copy arguments that are passed on the stack to the appropriate stack
  // locations.
@@ -275,10 +265,6 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
    Context.insert(InstFakeUse::create(Func, Dest));
  }

-  // Add the appropriate offset to esp.
-  if (ParameterAreaSizeBytes)
-    _adjust_stack(-ParameterAreaSizeBytes);
-
  // Generate a FakeUse to keep the call live if necessary.
  if (Instr->hasSideEffects() && ReturnReg) {
    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
@@ -391,6 +377,10 @@ void TargetX8632::addProlog(CfgNode *Node) {
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
+  // | 9. padding             |
+  // +------------------------+
+  // | 10. out args           |
+  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
  //  * X86_RET_IP_SIZE_BYTES:  area 1
@@ -399,7 +389,8 @@ void TargetX8632::addProlog(CfgNode *Node) {
  //  * GlobalsSize:            area 4
  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  //  * LocalsSpillAreaSize:    area 6
-  //  * SpillAreaSizeBytes:     areas 3 - 7
+  //  * SpillAreaSizeBytes:     areas 3 - 10
+  //  * maxOutArgsSizeBytes():  area 10

  // Determine stack frame offsets for each Variable without a register
  // assignment. This can be done as one variable per stack slot. Or, do
@@ -515,7 +506,10 @@ void TargetX8632::addProlog(CfgNode *Node) {
        Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
    uint32_t StackSize =
        Traits::applyStackAlignment(StackOffset + SpillAreaSizeBytes);
+    StackSize = Traits::applyStackAlignment(StackSize + maxOutArgsSizeBytes());
    SpillAreaSizeBytes = StackSize - StackOffset;
+  } else {
+    SpillAreaSizeBytes += maxOutArgsSizeBytes();
  }

  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
@@ -543,12 +537,6 @@ void TargetX8632::addProlog(CfgNode *Node) {

  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

-  // Initialize the stack adjustment so that after all the known-frame-offset
-  // alloca instructions are emitted, the stack adjustment will reach zero.
-  resetStackAdjustment();
-  if (!PrologEmitsFixedAllocas)
-    updateStackAdjustment(-FixedAllocaSizeBytes);
-
  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
@@ -599,7 +587,8 @@ void TargetX8632::addProlog(CfgNode *Node) {
    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
-        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
+        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
+        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
@@ -614,6 +603,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
+        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";

--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -752,9 +752,6 @@ template <> struct MachineTraits<TargetX8632> {

    bool getRandomized() const { return Randomized; }

-    void setIgnoreStackAdjust(bool Ignore) { IgnoreStackAdjust = Ignore; }
-    bool getIgnoreStackAdjust() const { return IgnoreStackAdjust; }
-
  private:
    X86OperandMem(Cfg *Func, Type Ty, Variable *Base, Constant *Offset,
                  Variable *Index, uint16_t Shift, SegmentRegisters SegmentReg);
@@ -768,11 +765,6 @@ template <> struct MachineTraits<TargetX8632> {
    /// memory operands are generated in
    /// TargetX86Base::randomizeOrPoolImmediate()
    bool Randomized;
-    /// Memory operations involving the stack pointer need to know when the
-    /// stack pointer was moved temporarily.  Ignore that adjustment in
-    /// cases that should be pinned to the stack pointer, such as outgoing
-    /// arguments to calls.
-    bool IgnoreStackAdjust = false;
  };

  /// VariableSplit is a way to treat an f64 memory location as a pair of i32

--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -170,7 +170,7 @@ void TargetX8664::lowerCall(const InstCall *Instr) {
  OperandList XmmArgs;
  OperandList GprArgs;
  OperandList StackArgs, StackArgLocations;
-  uint32_t ParameterAreaSizeBytes = 0;
+  int32_t ParameterAreaSizeBytes = 0;

  // Classify each argument operand according to the location where the
  // argument is passed.
@@ -204,16 +204,8 @@ void TargetX8664::lowerCall(const InstCall *Instr) {
  // Adjust the parameter area so that the stack is aligned. It is assumed that
  // the stack is already aligned at the start of the calling sequence.
  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
-
-  // Subtract the appropriate amount for the argument area. This also takes
-  // care of setting the stack adjustment during emission.
-  //
-  // TODO: If for some reason the call instruction gets dead-code eliminated
-  // after lowering, we would need to ensure that the pre-call and the
-  // post-call esp adjustment get eliminated as well.
-  if (ParameterAreaSizeBytes) {
-    _adjust_stack(ParameterAreaSizeBytes);
-  }
+  assert(static_cast<uint32_t>(ParameterAreaSizeBytes) <=
+         maxOutArgsSizeBytes());

  // Copy arguments that are passed on the stack to the appropriate stack
  // locations.
@@ -409,6 +401,10 @@ void TargetX8664::addProlog(CfgNode *Node) {
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
+  // | 9. padding             |
+  // +------------------------+
+  // | 10. out args           |
+  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
  //  * X86_RET_IP_SIZE_BYTES:  area 1
@@ -417,7 +413,8 @@ void TargetX8664::addProlog(CfgNode *Node) {
  //  * GlobalsSize:            area 4
  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  //  * LocalsSpillAreaSize:    area 6
-  //  * SpillAreaSizeBytes:     areas 3 - 7
+  //  * SpillAreaSizeBytes:     areas 3 - 10
+  //  * maxOutArgsSizeBytes():  area 10

  // Determine stack frame offsets for each Variable without a register
  // assignment. This can be done as one variable per stack slot. Or, do
@@ -514,24 +511,36 @@ void TargetX8664::addProlog(CfgNode *Node) {
        Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
    uint32_t StackSize =
        Traits::applyStackAlignment(StackOffset + SpillAreaSizeBytes);
+    StackSize = Traits::applyStackAlignment(StackSize + maxOutArgsSizeBytes());
    SpillAreaSizeBytes = StackSize - StackOffset;
+  } else {
+    SpillAreaSizeBytes += maxOutArgsSizeBytes();
  }

+  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
+  // fixed allocations in the prolog.
+  if (PrologEmitsFixedAllocas)
+    SpillAreaSizeBytes += FixedAllocaSizeBytes;
  // Generate "sub esp, SpillAreaSizeBytes"
-  if (SpillAreaSizeBytes)
+  if (SpillAreaSizeBytes) {
    _sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp),
         Ctx->getConstantInt32(SpillAreaSizeBytes));
+    // If the fixed allocas are aligned more than the stack frame, align the
+    // stack pointer accordingly.
+    if (PrologEmitsFixedAllocas &&
+        FixedAllocaAlignBytes > Traits::X86_STACK_ALIGNMENT_BYTES) {
+      assert(IsEbpBasedFrame);
+      _and(getPhysicalRegister(Traits::RegisterSet::Reg_esp),
+           Ctx->getConstantInt32(-FixedAllocaAlignBytes));
+    }
+  }

  // Account for alloca instructions with known frame offsets.
-  SpillAreaSizeBytes += FixedAllocaSizeBytes;
+  if (!PrologEmitsFixedAllocas)
+    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

-  // Initialize the stack adjustment so that after all the known-frame-offset
-  // alloca instructions are emitted, the stack adjustment will reach zero.
-  resetStackAdjustment();
-  updateStackAdjustment(-FixedAllocaSizeBytes);
-
  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
@@ -563,7 +572,9 @@ void TargetX8664::addProlog(CfgNode *Node) {
    // until after all the fixed-size alloca instructions have executed.  In
    // this case, a stack adjustment is needed when accessing in-args in order
    // to copy them into registers.
-    size_t StackAdjBytes = IsEbpBasedFrame ? 0 : -FixedAllocaSizeBytes;
+    size_t StackAdjBytes = 0;
+    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
+      StackAdjBytes -= FixedAllocaSizeBytes;
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                           InArgsSizeBytes);
  }
@@ -588,7 +599,8 @@ void TargetX8664::addProlog(CfgNode *Node) {
    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
-        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
+        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
+        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
@@ -603,6 +615,7 @@ void TargetX8664::addProlog(CfgNode *Node) {
    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
+        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";

--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -732,9 +732,6 @@ template <> struct MachineTraits<TargetX8664> {

    bool getRandomized() const { return Randomized; }

-    void setIgnoreStackAdjust(bool Ignore) { IgnoreStackAdjust = Ignore; }
-    bool getIgnoreStackAdjust() const { return IgnoreStackAdjust; }
-
  private:
    X86OperandMem(Cfg *Func, Type Ty, Variable *Base, Constant *Offset,
                  Variable *Index, uint16_t Shift);
@@ -747,11 +744,6 @@ template <> struct MachineTraits<TargetX8664> {
    /// memory operands are generated in
    /// TargetX86Base::randomizeOrPoolImmediate()
    bool Randomized = false;
-    /// Memory operations involving the stack pointer need to know when the
-    /// stack pointer was moved temporarily.  Ignore that adjustment in
-    /// cases that should be pinned to the stack pointer, such as outgoing
-    /// arguments to calls.
-    bool IgnoreStackAdjust = false;
  };

  /// VariableSplit is a way to treat an f64 memory location as a pair of i32

--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -109,7 +109,13 @@ public:
  }
  /// Returns the (negative) offset from ebp/rbp where the fixed Allocas start.
  int32_t getFrameFixedAllocaOffset() const override {
-    return FixedAllocaSizeBytes - SpillAreaSizeBytes;
+    return FixedAllocaSizeBytes - (SpillAreaSizeBytes - maxOutArgsSizeBytes());
+  }
+  virtual uint32_t maxOutArgsSizeBytes() const override {
+    return MaxOutArgsSizeBytes;
+  }
+  virtual void updateMaxOutArgsSizeBytes(uint32_t Size) {
+    MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, Size);
  }

  bool shouldSplitToVariable64On32(Type Ty) const override {
@@ -182,8 +188,10 @@ protected:
  void lowerOther(const Inst *Instr) override;
  void lowerRMW(const typename Traits::Insts::FakeRMW *RMW);
  void prelowerPhis() override;
+  uint32_t getCallStackArgumentsSizeBytes(const std::vector<Type> &ArgTypes,
+                                          Type ReturnType);
  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override;
-  void genTargetHelperCallFor(Inst *Instr) override { (void)Instr; }
+  void genTargetHelperCallFor(Inst *Instr) override;
  void doAddressOptLoad() override;
  void doAddressOptStore() override;
  void doMockBoundsCheck(Operand *Opnd) override;
@@ -330,10 +338,6 @@ protected:
  void _add_rmw(typename Traits::X86OperandMem *DestSrc0, Operand *Src1) {
    Context.insert(Traits::Insts::AddRMW::create(Func, DestSrc0, Src1));
  }
-  void _adjust_stack(int32_t Amount) {
-    Context.insert(Traits::Insts::AdjustStack::create(
-        Func, Amount, getPhysicalRegister(Traits::RegisterSet::Reg_esp)));
-  }
  void _addps(Variable *Dest, Operand *Src0) {
    Context.insert(Traits::Insts::Addps::create(Func, Dest, Src0));
  }
@@ -705,6 +709,7 @@ protected:
  size_t FixedAllocaSizeBytes = 0;
  size_t FixedAllocaAlignBytes = 0;
  bool PrologEmitsFixedAllocas = false;
+  uint32_t MaxOutArgsSizeBytes = 0;
  static std::array<llvm::SmallBitVector, RCX86_NUM> TypeToRegisterSet;
  static std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
      RegisterAliases;

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
--- a/tests_lit/assembler/x86/sandboxing.ll
+++ b/tests_lit/assembler/x86/sandboxing.ll
@@ -251,10 +251,10 @@ entry:
  ret void
 }
 ; CHECK-LABEL: checkpoint_restore_stack_adjustment
+; CHECK: sub esp,0x1c
 ; CHECK: call
-; CHECK: sub esp,0x10
 ; The address of %arg should be [esp+0x20], not [esp+0x30].
 ; CHECK-NEXT: mov [[REG:.*]],DWORD PTR [esp+0x20]
 ; CHECK-NEXT: mov DWORD PTR [esp],[[REG]]
 ; CHECK: call
-; CHECK: add esp,0x10
+; CHECK: add esp,0x1c
--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -53,14 +53,12 @@ entry:
 ; CHECK:      mov     DWORD PTR [esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp+0xc]
 ; CHECK:      call {{.*}} R_{{.*}}    ignore64BitArgNoInline
-; CHECK:      sub     esp
 ; CHECK:      mov     DWORD PTR [esp+0x4]
 ; CHECK:      mov     DWORD PTR [esp]
 ; CHECK:      mov     DWORD PTR [esp+0x8],0x7b
 ; CHECK:      mov     DWORD PTR [esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp+0xc]
 ; CHECK:      call {{.*}} R_{{.*}}    ignore64BitArgNoInline
-; CHECK:      sub     esp
 ; CHECK:      mov     DWORD PTR [esp+0x4]
 ; CHECK:      mov     DWORD PTR [esp]
 ; CHECK:      mov     DWORD PTR [esp+0x8],0x7b
@@ -76,14 +74,12 @@ entry:
 ; OPTM1:      mov     DWORD PTR [esp+0x10]
 ; OPTM1:      mov     DWORD PTR [esp+0xc]
 ; OPTM1:      call {{.*}} R_{{.*}}    ignore64BitArgNoInline
-; OPTM1:      sub     esp
 ; OPTM1:      mov     DWORD PTR [esp+0x4]
 ; OPTM1:      mov     DWORD PTR [esp]
 ; OPTM1:      mov     DWORD PTR [esp+0x8],0x7b
 ; OPTM1:      mov     DWORD PTR [esp+0x10]
 ; OPTM1:      mov     DWORD PTR [esp+0xc]
 ; OPTM1:      call {{.*}} R_{{.*}}    ignore64BitArgNoInline
-; OPTM1:      sub     esp
 ; OPTM1:      mov     DWORD PTR [esp+0x4]
 ; OPTM1:      mov     DWORD PTR [esp]
 ; OPTM1:      mov     DWORD PTR [esp+0x8],0x7b

--- a/tests_lit/llvm2ice_tests/align-spill-locations.ll
+++ b/tests_lit/llvm2ice_tests/align-spill-locations.ll
@@ -83,9 +83,9 @@ block:
  ret <4 x i32> %vec.local
 ; CHECK-LABEL: align_local_vector_and_global_float
 ; CHECK: cvtsi2ss xmm0,eax
-; CHECK-NEXT: movss DWORD PTR [esp+{{0xc|0x1c}}],xmm0
-; CHECK: movups xmm0,XMMWORD PTR [{{esp|esp\+0x10}}]
-; CHECK-NEXT: add esp,0x2c
+; CHECK-NEXT: movss DWORD PTR [esp+{{0x1c|0x2c}}],xmm0
+; CHECK: movups xmm0,XMMWORD PTR [{{esp\+0x10|esp\+0x20}}]
+; CHECK-NEXT: add esp,0x3c
 ; CHECK-NEXT: ret
 }


--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
@@ -34,15 +34,14 @@ entry:
  ret void
 }
 ; CHECK-LABEL: fixed_416_align_16
-; CHECK:      sub     esp,0x1ac
-; CHECK:      sub     esp,0x10
+; CHECK:      sub     esp,0x1bc
+; CHECK:      lea     eax,[esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f1

 ; CHECK-OPTM1-LABEL: fixed_416_align_16
-; CHECK-OPTM1:      sub     esp,0xc
+; CHECK-OPTM1:      sub     esp,0x18
 ; CHECK-OPTM1:      sub     esp,0x1a0
-; CHECK-OPTM1:      sub     esp,0x10
 ; CHECK-OPTM1:      mov     DWORD PTR [esp],eax
 ; CHECK-OPTM1:      call {{.*}} R_{{.*}}    f1

@@ -61,9 +60,9 @@ entry:
 ; CHECK-LABEL: fixed_416_align_32
 ; CHECK:      push    ebp
 ; CHECK-NEXT: mov     ebp,esp
-; CHECK:      sub     esp,0x1a8
+; CHECK:      sub     esp,0x1b8
 ; CHECK:      and     esp,0xffffffe0
-; CHECK:      sub     esp,0x10
+; CHECK:      lea     eax,[esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f1

@@ -82,12 +81,13 @@ entry:
  ret void
 }
 ; CHECK-LABEL: fixed_351_align_16
-; CHECK:      sub     esp,0x16c
+; CHECK:      sub     esp,0x17c
+; CHECK:      lea     eax,[esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f1

 ; CHECK-OPTM1-LABEL: fixed_351_align_16
-; CHECK-OPTM1:      sub     esp,0xc
+; CHECK-OPTM1:      sub     esp,0x18
 ; CHECK-OPTM1:      sub     esp,0x160
 ; CHECK-OPTM1:      mov     DWORD PTR [esp],eax
 ; CHECK-OPTM1:      call {{.*}} R_{{.*}}    f1
@@ -107,9 +107,9 @@ entry:
 ; CHECK-LABEL: fixed_351_align_32
 ; CHECK:      push    ebp
 ; CHECK-NEXT: mov     ebp,esp
-; CHECK:      sub     esp,0x168
+; CHECK:      sub     esp,0x178
 ; CHECK:      and     esp,0xffffffe0
-; CHECK:      sub     esp,0x10
+; CHECK:      lea     eax,[esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f1

@@ -131,11 +131,12 @@ entry:
  ret void
 }
 ; CHECK-LABEL: variable_n_align_16
+; CHECK:      sub     esp,0x18
 ; CHECK:      mov     eax,DWORD PTR [ebp+0x8]
 ; CHECK:      add     eax,0xf
 ; CHECK:      and     eax,0xfffffff0
 ; CHECK:      sub     esp,eax
-; CHECK:      sub     esp,0x10
+; CHECK:      lea     eax,[esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f2

@@ -156,12 +157,13 @@ entry:
 ; CHECK-LABEL: variable_n_align_32
 ; CHECK:      push    ebp
 ; CHECK:      mov     ebp,esp
+; CHECK:      sub     esp,0x18
 ; CHECK-DAG:  and     esp,0xffffffe0
 ; CHECK-DAG:  mov     eax,DWORD PTR [ebp+0x8]
 ; CHECK:      add     eax,0x1f
 ; CHECK:      and     eax,0xffffffe0
 ; CHECK:      sub     esp,eax
-; CHECK:      sub     esp,0x10
+; CHECK:      lea     eax,[esp+0x10]
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f2
 ; CHECK:      mov     esp,ebp

--- a/tests_lit/llvm2ice_tests/commutativity.ll
+++ b/tests_lit/llvm2ice_tests/commutativity.ll
@@ -61,7 +61,7 @@ entry:
  ret float %result
 }
 ; CHECK-LABEL: floatAddLeft
-; CHECK-NEXT: sub esp,0xc
+; CHECK-NEXT: sub esp,0x1c
 ; CHECK-NEXT: movss xmm0,DWORD PTR
 ; CHECK-NEXT: movss xmm1,DWORD PTR
 ; CHECK-NEXT: addss xmm1,xmm0
@@ -74,7 +74,7 @@ entry:
  ret float %result
 }
 ; CHECK-LABEL: floatAddRight
-; CHECK-NEXT: sub esp,0xc
+; CHECK-NEXT: sub esp,0x1c
 ; CHECK-NEXT: movss xmm0,DWORD PTR
 ; CHECK-NEXT: movss xmm1,DWORD PTR
 ; CHECK-NEXT: addss xmm0,xmm1
@@ -87,7 +87,7 @@ entry:
  ret float %result
 }
 ; CHECK-LABEL: floatMultiplyLeft
-; CHECK-NEXT: sub esp,0xc
+; CHECK-NEXT: sub esp,0x1c
 ; CHECK-NEXT: movss xmm0,DWORD PTR
 ; CHECK-NEXT: movss xmm1,DWORD PTR
 ; CHECK-NEXT: mulss xmm1,xmm0
@@ -100,7 +100,7 @@ entry:
  ret float %result
 }
 ; CHECK-LABEL: floatMultiplyRight
-; CHECK-NEXT: sub esp,0xc
+; CHECK-NEXT: sub esp,0x1c
 ; CHECK-NEXT: movss xmm0,DWORD PTR
 ; CHECK-NEXT: movss xmm1,DWORD PTR
 ; CHECK-NEXT: mulss xmm0,xmm1

--- a/tests_lit/llvm2ice_tests/ebp_args.ll
+++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -27,14 +27,14 @@ eblock:
 ; CHECK-LABEL: memcpy_helper
 ; CHECK:  push  ebp
 ; CHECK:  mov   ebp,esp
-; CHECK:  sub   esp,0x18
+; CHECK:  sub   esp,0x28
 ; CHECK:  sub   esp,0x80
-; CHECK:  mov   DWORD PTR [ebp-0x4],esp
+; CHECK:  lea   eax,[esp+0x10]
+; CHECK:  mov   DWORD PTR [ebp-0x4],eax
 ; CHECK:  mov   eax,DWORD PTR [ebp+0xc]
 ; CHECK:  mov   BYTE PTR [ebp-0x8],al
 ; CHECK:  movzx eax,BYTE PTR [ebp-0x8]
 ; CHECK:  mov   DWORD PTR [ebp-0xc],eax
-; CHECK:  sub   esp,0x10
 ; CHECK:  mov   eax,DWORD PTR [ebp+0x8]
 ; CHECK:  mov   DWORD PTR [esp],eax
 ; CHECK:  mov   eax,DWORD PTR [ebp-0x4]

--- a/tests_lit/llvm2ice_tests/elf_function_sections.ll
+++ b/tests_lit/llvm2ice_tests/elf_function_sections.ll
@@ -77,13 +77,13 @@ define void @_start(i32 %x) {

 ; CHECK: Relocations [
 ; CHECK:   Section ({{[0-9]+}}) .rel.text.foo {
-; CHECK:     0x21 R_386_PC32 memcpy 0x0
+; CHECK:     0x1E R_386_PC32 memcpy 0x0
 ; CHECK:   }
 ;   Relocation can be against the start of the section or
 ;   the function's symbol itself.
 ; CHECK:   Section ({{[0-9]+}}) .rel.text._start {
-; CHECK:     0x13 R_386_PC32 {{.*}}bar 0x0
-; CHECK:     0x25 R_386_PC32 {{.*}}foo 0x0
+; CHECK:     0x10 R_386_PC32 {{.*}}bar 0x0
+; CHECK:     0x1C R_386_PC32 {{.*}}foo 0x0
 ; CHECK:   }
 ; CHECK: ]


--- a/tests_lit/llvm2ice_tests/elf_nodata.ll
+++ b/tests_lit/llvm2ice_tests/elf_nodata.ll
@@ -83,7 +83,7 @@ define void @_start(i32 %x) {

 ; CHECK: Relocations [
 ; CHECK:   Section ({{[0-9]+}}) .rel.text {
-; CHECK:     0x21 R_386_PC32 memcpy 0x0
+; CHECK:     0x1E R_386_PC32 memcpy 0x0
 ; CHECK:   }
 ; CHECK: ]


--- a/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
@@ -17,10 +17,9 @@ entry:
 }

 ; CHECK-LABEL:  caller1
-; CHECK-NEXT:   sub    esp,0x2c
-; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x30]
-; CHECK-NEXT:   mov    DWORD PTR [esp],eax
-; CHECK-NEXT:   sub    esp,0x20
+; CHECK-NEXT:   sub    esp,0x4c
+; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x50]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x20],eax
 ; CHECK-NEXT:   mov    DWORD PTR [esp],eax
 ; CHECK-NEXT:   lea    eax,[esp+0x20]
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x4],eax
@@ -31,8 +30,7 @@ entry:
 ; CHECK-NEXT:   lea    eax,[esp+0x20]
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x10],eax
 ; CHECK-NEXT:   call
-; CHECK-NEXT:   add    esp,0x20
-; CHECK-NEXT:   add    esp,0x2c
+; CHECK-NEXT:   add    esp,0x4c
 ; CHECK-NEXT:   ret

 ; Test that alloca base addresses get passed correctly to functions.
@@ -49,11 +47,10 @@ entry:
 }

 ; CHECK-LABEL:  caller2
-; CHECK-NEXT:   sub    esp,0x4c
-; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x50]
-; CHECK-NEXT:   mov    DWORD PTR [esp],eax
+; CHECK-NEXT:   sub    esp,0x6c
+; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x70]
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x20],eax
-; CHECK-NEXT:   sub    esp,0x20
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x40],eax
 ; CHECK-NEXT:   mov    DWORD PTR [esp],eax
 ; CHECK-NEXT:   lea    eax,[esp+0x20]
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x4],eax
@@ -64,6 +61,5 @@ entry:
 ; CHECK-NEXT:   lea    eax,[esp+0x40]
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x10],eax
 ; CHECK-NEXT:   call
-; CHECK-NEXT:   add    esp,0x20
-; CHECK-NEXT:   add    esp,0x4c
+; CHECK-NEXT:   add    esp,0x6c
 ; CHECK-NEXT:   ret
--- a/tests_lit/llvm2ice_tests/fused-alloca.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca.ll
@@ -144,7 +144,7 @@ next:
 ; CHECK-NEXT: push   ebx
 ; CHECK-NEXT: push   ebp
 ; CHECK-NEXT: mov    ebp,esp
-; CHECK-NEXT: sub    esp,0x64
+; CHECK-NEXT: sub    esp,0x24
 ; CHECK-NEXT: mov    eax,DWORD PTR [ebp+0xc]
 ; CHECK-NEXT: and    esp,0xffffffe0
 ; CHECK-NEXT: sub    esp,0x40

--- a/tests_lit/llvm2ice_tests/icmp-with-zero.ll
+++ b/tests_lit/llvm2ice_tests/icmp-with-zero.ll
@@ -149,8 +149,8 @@ if.end:                                          ; preds = %if.then, %if.end
 ; CHECK-LABEL: icmpUltZero32
 ; CHECK: mov [[RESULT:.*]],0x0
 ; CHECK-NEXT: cmp [[RESULT]],0x0
-; OPTM1: icmpUltZero32
-; OPTM1 [[RESULT:.*]],0x0
+; OPTM1-LABEL: icmpUltZero32
+; OPTM1: mov [[RESULT:.*]],0x0
 ; OPTM1: cmp [[RESULT]],0x0

 define internal void @icmpUgeZero32() {

--- a/tests_lit/llvm2ice_tests/int-arg.ll
+++ b/tests_lit/llvm2ice_tests/int-arg.ll
@@ -277,10 +277,10 @@ define internal void @test_passing_integers(i32 %arg0, i32 %arg1, i32 %arg2, i32
 }

 ; CHECK-LABEL: test_passing_integers
-; CHECK-DAG: mov [[REG1:e.*]],DWORD PTR [esp+0x24]
-; CHECK-DAG: mov [[REG2:e.*]],DWORD PTR [esp+0x28]
-; CHECK-DAG: mov [[REG3:e.*]],DWORD PTR [esp+0x2c]
-; CHECK-DAG: mov [[REG4:e.*]],DWORD PTR [esp+0x30]
+; CHECK-DAG: mov [[REG1:e.*]],DWORD PTR [esp+0x44]
+; CHECK-DAG: mov [[REG2:e.*]],DWORD PTR [esp+0x48]
+; CHECK-DAG: mov [[REG3:e.*]],DWORD PTR [esp+0x4c]
+; CHECK-DAG: mov [[REG4:e.*]],DWORD PTR [esp+0x50]
 ; CHECK: mov DWORD PTR [esp]
 ; CHECK: mov DWORD PTR [esp+0x4]
 ; CHECK-DAG: mov DWORD PTR [esp+0x8],[[REG4]]

--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -540,11 +540,15 @@ entry:
  ret void
 }
 ; CHECK-LABEL: test_stacksave_multiple
-; At least 3 copies of esp, but probably more from having to do the allocas.
-; CHECK: mov {{.*}},esp
-; CHECK: mov {{.*}},esp
+; lea is used to copy from esp for the allocas.
+; Otherwise, only one stacksave is live.
+; CHECK: mov ebp,esp
 ; CHECK: mov {{.*}},esp
+; CHECK: lea {{.*}},[esp+0x10]
+; CHECK: lea {{.*}},[esp+0x10]
+; CHECK: call
 ; CHECK: mov esp,{{.*}}
+; CHECK: mov esp,ebp
 ; ARM32-LABEL: test_stacksave_multiple
 ; ARM32: mov {{.*}}, sp
 ; ARM32: mov {{.*}}, sp

--- a/tests_lit/llvm2ice_tests/vector-arg.ll
+++ b/tests_lit/llvm2ice_tests/vector-arg.ll
@@ -190,7 +190,7 @@ entry:
                        <4 x float> %arg6, <4 x float> %arg5, <4 x float> %arg4)
  ret void
 ; CHECK-LABEL: test_passing_vectors
-; CHECK: sub esp,0x20
+; CHECK-NEXT: sub esp,0x2c
 ; CHECK: movups  [[ARG5:.*]],XMMWORD PTR [esp+0x40]
 ; CHECK: movups  XMMWORD PTR [esp],[[ARG5]]
 ; CHECK: movups  [[ARG6:.*]],XMMWORD PTR [esp+0x30]
@@ -200,10 +200,10 @@ entry:
 ; CHECK: movups  xmm2,XMMWORD PTR [esp+0x60]
 ; CHECK: movups  xmm3,XMMWORD PTR [esp+0x50]
 ; CHECK: call {{.*}} R_{{.*}} VectorArgs
-; CHECK-NEXT: add esp,0x20
+; CHECK-NEXT: add esp,0x2c

 ; OPTM1-LABEL: test_passing_vectors
-; OPTM1: sub esp,0x20
+; OPTM1: sub esp,0x6c
 ; OPTM1: movups  [[ARG5:.*]],XMMWORD PTR {{.*}}
 ; OPTM1: movups  XMMWORD PTR [esp],[[ARG5]]
 ; OPTM1: movups  [[ARG6:.*]],XMMWORD PTR {{.*}}
@@ -213,7 +213,7 @@ entry:
 ; OPTM1: movups  xmm2,XMMWORD PTR {{.*}}
 ; OPTM1: movups  xmm3,XMMWORD PTR {{.*}}
 ; OPTM1: call {{.*}} R_{{.*}} VectorArgs
-; OPTM1-NEXT: add esp,0x20
+; OPTM1-NEXT: add esp,0x6c
 }

 declare void @InterspersedVectorArgs(
@@ -234,7 +234,7 @@ entry:
                                    <4 x float> %arg5, i32 4, <4 x float> %arg4)
  ret void
 ; CHECK-LABEL: test_passing_vectors_interspersed
-; CHECK: sub esp,0x50
+; CHECK: sub esp,0x5c
 ; CHECK: movups  [[ARG9:.*]],XMMWORD PTR [esp+0x70]
 ; CHECK: movups  XMMWORD PTR [esp+0x20],[[ARG9]]
 ; CHECK: movups  [[ARG11:.*]],XMMWORD PTR [esp+0x60]
@@ -244,11 +244,11 @@ entry:
 ; CHECK: movups  xmm2,XMMWORD PTR [esp+0x90]
 ; CHECK: movups  xmm3,XMMWORD PTR [esp+0x80]
 ; CHECK: call {{.*}} R_{{.*}} InterspersedVectorArgs
-; CHECK-NEXT: add esp,0x50
+; CHECK-NEXT: add esp,0x5c
 ; CHECK: ret

 ; OPTM1-LABEL: test_passing_vectors_interspersed
-; OPTM1: sub esp,0x50
+; OPTM1: sub esp,0x9c
 ; OPTM1: movups  [[ARG9:.*]],XMMWORD PTR {{.*}}
 ; OPTM1: movups  XMMWORD PTR [esp+0x20],[[ARG9]]
 ; OPTM1: movups  [[ARG11:.*]],XMMWORD PTR {{.*}}
@@ -258,7 +258,7 @@ entry:
 ; OPTM1: movups  xmm2,XMMWORD PTR {{.*}}
 ; OPTM1: movups  xmm3,XMMWORD PTR {{.*}}
 ; OPTM1: call {{.*}} R_{{.*}} InterspersedVectorArgs
-; OPTM1-NEXT: add esp,0x50
+; OPTM1-NEXT: add esp,0x9c
 ; OPTM1: ret
 }