Subzero. ARM32. No more SP frobbing.

Pre-computes the maximum stack size needed for outgoing arguments, pre-allocates it during the prolog, and deallocates it during the epilog. With this CL, there are no more StackAdjustments needed for ARM32, which will simplify rematerializing alloca'd variables. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=sehr@chromium.org Review URL: https://codereview.chromium.org/1467473003 .

Subzero. ARM32. No more SP frobbing.
f4198548 · John Porto · 5e0a8a71 · f4198548 · f4198548 · f4198548
Commit f4198548 authored Nov 20, 2015 by John Porto
9 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -382,13 +382,6 @@ OperandARM32FlexReg::OperandARM32FlexReg(Cfg *Func, Type Ty, Variable *Reg,
    Vars[1] = ShiftVar;
 }
-InstARM32AdjustStack::InstARM32AdjustStack(Cfg *Func, Variable *SP,
-                                           SizeT Amount, Operand *SrcAmount)
-    : InstARM32(Func, InstARM32::Adjuststack, 2, SP), Amount(Amount) {
-  addSource(SP);
-  addSource(SrcAmount);
-}
 InstARM32Br::InstARM32Br(Cfg *Func, const CfgNode *TargetTrue,
                         const CfgNode *TargetFalse,
                         const InstARM32Label *Label, CondARM32::Cond Pred)
@@ -1272,33 +1265,6 @@ void InstARM32Pop::dump(const Cfg *Func) const {
  }
 }
-void InstARM32AdjustStack::emit(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 2);
-  Str << "\t"
-      << "sub"
-      << "\t";
-  getDest()->emit(Func);
-  Str << ", ";
-  getSrc(0)->emit(Func);
-  Str << ", ";
-  getSrc(1)->emit(Func);
-  Func->getTarget()->updateStackAdjustment(Amount);
-}
-void InstARM32AdjustStack::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  getDest()->dump(Func);
-  Str << " = sub.i32 ";
-  getSrc(0)->dump(Func);
-  Str << ", " << Amount << " ; ";
-  getSrc(1)->dump(Func);
-}
 void InstARM32Push::emit(const Cfg *Func) const {
  // TODO(jpp): Improve FP register save/restore.
  if (!BuildDefs::dump())

--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -980,34 +980,6 @@ private:
  const InstARM32Label *Label; // Intra-block branch target
 };
-/// AdjustStack instruction - subtracts SP by the given amount and updates the
-/// stack offset during code emission.
-class InstARM32AdjustStack : public InstARM32 {
-  InstARM32AdjustStack() = delete;
-  InstARM32AdjustStack(const InstARM32AdjustStack &) = delete;
-  InstARM32AdjustStack &operator=(const InstARM32AdjustStack &) = delete;
-public:
-  /// Note: We need both Amount and SrcAmount. If Amount is too large then it
-  /// needs to be copied to a register (so SrcAmount could be a register).
-  /// However, we also need the numeric Amount for bookkeeping, and it's hard to
-  /// pull that from the generic SrcAmount operand.
-  static InstARM32AdjustStack *create(Cfg *Func, Variable *SP, SizeT Amount,
-                                      Operand *SrcAmount) {
-    return new (Func->allocate<InstARM32AdjustStack>())
-        InstARM32AdjustStack(Func, SP, Amount, SrcAmount);
-  }
-  void emit(const Cfg *Func) const override;
-  void dump(const Cfg *Func) const override;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Adjuststack); }
-  SizeT getAmount() const { return Amount; }
-private:
-  InstARM32AdjustStack(Cfg *Func, Variable *SP, SizeT Amount,
-                       Operand *SrcAmount);
-  const SizeT Amount;
-};
 /// Call instruction (bl/blx). Arguments should have already been pushed.
 /// Technically bl and the register form of blx can be predicated, but we'll
 /// leave that out until needed.

--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -311,6 +311,7 @@ protected:
  virtual void lowerOther(const Inst *Instr);
  virtual void genTargetHelperCallFor(Inst *Instr) = 0;
+  virtual uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) = 0;
  virtual void doAddressOptLoad() {}
  virtual void doAddressOptStore() {}

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -233,12 +233,62 @@ void copyRegAllocFromInfWeightVariable64On32(const VarList &Vars) {
 }
 } // end of anonymous namespace
+uint32_t TargetARM32::getCallStackArgumentsSizeBytes(const InstCall *Call) { // Bytes of Call's args not placed in registers.
+  TargetARM32::CallingConv CC; // Queried once per argument, in order.
+  size_t OutArgsSizeBytes = 0; // Running total for arguments that CC does not put in registers.
+  for (SizeT i = 0, NumArgs = Call->getNumArgs(); i < NumArgs; ++i) {
+    Operand *Arg = legalizeUndef(Call->getArg(i));
+    Type Ty = Arg->getType();
+    if (Ty == IceType_i64) {
+      std::pair<int32_t, int32_t> Regs;
+      if (CC.I64InRegs(&Regs)) {
+        continue; // CC placed the i64 in registers; adds nothing to the total.
+      }
+    } else if (isVectorType(Ty) || isFloatingType(Ty)) {
+      int32_t Reg;
+      if (CC.FPInReg(Ty, &Reg)) {
+        continue; // CC placed the vector/FP value in a register.
+      }
+    } else {
+      assert(Ty == IceType_i32); // Any other type is expected to be i32 here.
+      int32_t Reg;
+      if (CC.I32InReg(&Reg)) {
+        continue; // CC placed the i32 in a register.
+      }
+    }
+    OutArgsSizeBytes = applyStackAlignmentTy(OutArgsSizeBytes, Ty); // Align the running offset for Ty first.
+    OutArgsSizeBytes += typeWidthInBytesOnStack(Ty); // Then account for the argument itself.
+  }
+  return applyStackAlignment(OutArgsSizeBytes); // The total is returned stack-aligned.
+}
+void TargetARM32::findMaxStackOutArgsSize() { // Sets MaxOutArgsSizeBytes from the InstCalls in Func.
+  // MinNeededOutArgsBytes should be updated if the Target ever creates a
+  // high-level InstCall that requires more stack bytes.
+  constexpr size_t MinNeededOutArgsBytes = 0;
+  MaxOutArgsSizeBytes = MinNeededOutArgsBytes; // Start from the floor, then grow below.
+  for (CfgNode *Node : Func->getNodes()) { // Visit every node of the function.
+    Context.init(Node);
+    while (!Context.atEnd()) {
+      PostIncrLoweringContext PostIncrement(Context); // NOTE(review): presumably advances Context at scope exit -- confirm.
+      Inst *CurInstr = Context.getCur();
+      if (auto *Call = llvm::dyn_cast<InstCall>(CurInstr)) { // Only calls contribute.
+        SizeT OutArgsSizeBytes = getCallStackArgumentsSizeBytes(Call);
+        MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, OutArgsSizeBytes); // Keep the largest seen.
+      }
+    }
+  }
+}
 void TargetARM32::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);
  // TODO(stichnot): share passes with X86?
  // https://code.google.com/p/nativeclient/issues/detail?id=4094
  genTargetHelperCalls();
+  findMaxStackOutArgsSize();
  // Do not merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = false;
@@ -346,6 +396,7 @@ void TargetARM32::translateOm1() {
  // TODO: share passes with X86?
  genTargetHelperCalls();
+  findMaxStackOutArgsSize();
  // Do not merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = false;
@@ -473,8 +524,6 @@ void TargetARM32::emitVariable(const Variable *Var) const {
  int32_t BaseRegNum = Var->getBaseRegNum();
  if (BaseRegNum == Variable::NoRegister) {
    BaseRegNum = getFrameOrStackReg();
-    if (!hasFramePointer())
-      Offset += getStackAdjustment();
  }
  const Type VarTy = Var->getType();
  Str << "[" << getRegName(BaseRegNum, VarTy);
@@ -670,7 +719,11 @@ void TargetARM32::addProlog(CfgNode *Node) {
  // +------------------------+
  // | 6. padding             |
  // +------------------------+
-  // | 7. allocas             |
+  // | 7. allocas (variable)  |
+  // +------------------------+
+  // | 8. padding             |
+  // +------------------------+
+  // | 9. out args            |
  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
@@ -679,7 +732,9 @@ void TargetARM32::addProlog(CfgNode *Node) {
  //  * GlobalsSize:            area 3
  //  * GlobalsAndSubsequentPaddingSize: areas 3 - 4
  //  * LocalsSpillAreaSize:    area 5
-  //  * SpillAreaSizeBytes:     areas 2 - 6
+  //  * SpillAreaSizeBytes:     areas 2 - 6, and 9
+  //  * MaxOutArgsSizeBytes:    area 9
+  //
  // Determine stack frame offsets for each Variable without a register
  // assignment.  This can be done as one variable per stack slot.  Or, do
  // coalescing by running the register allocator again with an infinite set of
@@ -785,10 +840,13 @@ void TargetARM32::addProlog(CfgNode *Node) {
  uint32_t GlobalsAndSubsequentPaddingSize =
      GlobalsSize + LocalsSlotsPaddingBytes;
-  // Align SP if necessary.
+  // Add the out args space to the stack, and align SP if necessary.
-  if (NeedsStackAlignment) {
+  if (!NeedsStackAlignment) {
+    SpillAreaSizeBytes += MaxOutArgsSizeBytes;
+  } else {
    uint32_t StackOffset = PreservedRegsSizeBytes;
    uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
+    StackSize = applyStackAlignment(StackSize + MaxOutArgsSizeBytes);
    SpillAreaSizeBytes = StackSize - StackOffset;
  }
@@ -802,8 +860,6 @@ void TargetARM32::addProlog(CfgNode *Node) {
  }
  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
-  resetStackAdjustment();
  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
@@ -847,7 +903,8 @@ void TargetARM32::addProlog(CfgNode *Node) {
    Str << "Stack layout:\n";
    uint32_t SPAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
-        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
+        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
+        MaxOutArgsSizeBytes;
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
@@ -860,6 +917,7 @@ void TargetARM32::addProlog(CfgNode *Node) {
    Str << "Stack details:\n"
        << " SP adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
+        << " outgoing args size = " << MaxOutArgsSizeBytes << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is FP based = " << UsesFramePointer << "\n";
@@ -956,10 +1014,7 @@ bool TargetARM32::isLegalMemOffset(Type Ty, int32_t Offset) const {
  return OperandARM32Mem::canHoldOffset(Ty, ZeroExt, Offset);
 }
-Variable *TargetARM32::newBaseRegister(int32_t OriginalOffset,
+Variable *TargetARM32::newBaseRegister(int32_t Offset, Variable *OrigBaseReg) {
-                                       int32_t StackAdjust,
-                                       Variable *OrigBaseReg) {
-  int32_t Offset = OriginalOffset + StackAdjust;
  // Legalize will likely need a movw/movt combination, but if the top bits are
  // all 0 from negating the offset and subtracting, we could use that instead.
  bool ShouldSub = (-Offset & 0xFFFF0000) == 0;
@@ -976,26 +1031,25 @@ Variable *TargetARM32::newBaseRegister(int32_t OriginalOffset,
 }
 OperandARM32Mem *TargetARM32::createMemOperand(Type Ty, int32_t Offset,
-                                               int32_t StackAdjust,
                                               Variable *OrigBaseReg,
                                               Variable **NewBaseReg,
                                               int32_t *NewBaseOffset) {
-  if (isLegalMemOffset(Ty, Offset + StackAdjust)) {
+  if (isLegalMemOffset(Ty, Offset)) {
    return OperandARM32Mem::create(
-        Func, Ty, OrigBaseReg, llvm::cast<ConstantInteger32>(
+        Func, Ty, OrigBaseReg,
-                                   Ctx->getConstantInt32(Offset + StackAdjust)),
+        llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(Offset)),
        OperandARM32Mem::Offset);
  }
  if (*NewBaseReg == nullptr) {
-    *NewBaseReg = newBaseRegister(Offset, StackAdjust, OrigBaseReg);
+    *NewBaseReg = newBaseRegister(Offset, OrigBaseReg);
-    *NewBaseOffset = Offset + StackAdjust;
+    *NewBaseOffset = Offset;
  }
-  int32_t OffsetDiff = Offset + StackAdjust - *NewBaseOffset;
+  int32_t OffsetDiff = Offset - *NewBaseOffset;
  if (!isLegalMemOffset(Ty, OffsetDiff)) {
-    *NewBaseReg = newBaseRegister(Offset, StackAdjust, OrigBaseReg);
+    *NewBaseReg = newBaseRegister(Offset, OrigBaseReg);
-    *NewBaseOffset = Offset + StackAdjust;
+    *NewBaseOffset = Offset;
    OffsetDiff = 0;
  }
@@ -1005,9 +1059,8 @@ OperandARM32Mem *TargetARM32::createMemOperand(Type Ty, int32_t Offset,
      OperandARM32Mem::Offset);
 }
-void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, int32_t StackAdjust,
+void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, Variable *OrigBaseReg,
-                              Variable *OrigBaseReg, Variable **NewBaseReg,
+                              Variable **NewBaseReg, int32_t *NewBaseOffset) {
-                              int32_t *NewBaseOffset) {
  Variable *Dest = MovInstr->getDest();
  assert(Dest != nullptr);
  Type DestTy = Dest->getType();
@@ -1027,8 +1080,8 @@ void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, int32_t StackAdjust,
    assert(SrcR->hasReg());
    const int32_t Offset = Dest->getStackOffset();
    // This is a _mov(Mem(), Variable), i.e., a store.
-    _str(SrcR, createMemOperand(DestTy, Offset, StackAdjust, OrigBaseReg,
+    _str(SrcR, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
-                                NewBaseReg, NewBaseOffset),
+                                NewBaseOffset),
         MovInstr->getPredicate());
    // _str() does not have a Dest, so we add a fake-def(Dest).
    Context.insert(InstFakeDef::create(Func, Dest));
@@ -1036,8 +1089,8 @@ void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, int32_t StackAdjust,
  } else if (auto *Var = llvm::dyn_cast<Variable>(Src)) {
    if (!Var->hasReg()) {
      const int32_t Offset = Var->getStackOffset();
-      _ldr(Dest, createMemOperand(DestTy, Offset, StackAdjust, OrigBaseReg,
+      _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
-                                  NewBaseReg, NewBaseOffset),
+                                  NewBaseOffset),
           MovInstr->getPredicate());
      Legalized = true;
    }
@@ -1064,7 +1117,6 @@ void TargetARM32::legalizeStackSlots() {
  Func->dump("Before legalizeStackSlots");
  assert(hasComputedFrame());
  Variable *OrigBaseReg = getPhysicalRegister(getFrameOrStackReg());
-  int32_t StackAdjust = 0;
  // Do a fairly naive greedy clustering for now. Pick the first stack slot
  // that's out of bounds and make a new base reg using the architecture's temp
  // register. If that works for the next slot, then great. Otherwise, create a
@@ -1091,23 +1143,8 @@ void TargetARM32::legalizeStackSlots() {
        NewBaseOffset = 0;
      }
-      // The stack adjustment only matters if we are using SP instead of FP.
-      if (!hasFramePointer()) {
-        if (auto *AdjInst = llvm::dyn_cast<InstARM32AdjustStack>(CurInstr)) {
-          StackAdjust += AdjInst->getAmount();
-          NewBaseOffset += AdjInst->getAmount();
-          continue;
-        }
-        if (llvm::isa<InstARM32Call>(CurInstr)) {
-          NewBaseOffset -= StackAdjust;
-          StackAdjust = 0;
-          continue;
-        }
-      }
      if (auto *MovInstr = llvm::dyn_cast<InstARM32Mov>(CurInstr)) {
-        legalizeMov(MovInstr, StackAdjust, OrigBaseReg, &NewBaseReg,
+        legalizeMov(MovInstr, OrigBaseReg, &NewBaseReg, &NewBaseOffset);
-                    &NewBaseOffset);
      }
    }
  }
@@ -1269,7 +1306,14 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
    alignRegisterPow2(T, Alignment);
    _sub(SP, SP, T);
  }
-  _mov(Dest, SP);
+  Variable *T = SP;
+  if (MaxOutArgsSizeBytes != 0) {
+    T = makeReg(getPointerType());
+    Operand *OutArgsSizeRF = legalize(
+        Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex);
+    _add(T, SP, OutArgsSizeRF);
+  }
+  _mov(Dest, T);
 }
 void TargetARM32::div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi) {
@@ -2093,6 +2137,8 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
  }
  case InstArithmetic::Sub: {
    if (Srcs.hasConstOperand()) {
+      // TODO(jpp): lowering Src0R here is wrong -- Src0R is not guaranteed
+      // to be used.
      Variable *Src0R = Srcs.src0R(this);
      if (Srcs.immediateIsFlexEncodable()) {
        Operand *Src1RF = Srcs.src1RF(this);
@@ -2346,7 +2392,7 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
                    TargetARM32::CallingConv::ARM32_MAX_FP_REG_UNITS> FPArgs;
  // Pair of Arg Operand -> stack offset.
  llvm::SmallVector<std::pair<Operand *, int32_t>, 8> StackArgs;
-  int32_t ParameterAreaSizeBytes = 0;
+  size_t ParameterAreaSizeBytes = 0;
  // Classify each argument operand according to the location where the
  // argument is passed.
@@ -2390,16 +2436,8 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
  // the stack is already aligned at the start of the calling sequence.
  ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
-  // Subtract the appropriate amount for the argument area. This also takes
+  if (ParameterAreaSizeBytes > MaxOutArgsSizeBytes) {
-  // care of setting the stack adjustment during emission.
+    llvm::report_fatal_error("MaxOutArgsSizeBytes is not really a max.");
-  //
-  // TODO: If for some reason the call instruction gets dead-code eliminated
-  // after lowering, we would need to ensure that the pre-call and the
-  // post-call esp adjustment get eliminated as well.
-  if (ParameterAreaSizeBytes) {
-    Operand *SubAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
-                                  Legal_Reg | Legal_Flex);
-    _adjust_stack(ParameterAreaSizeBytes, SubAmount);
  }
  // Copy arguments that are passed on the stack to the appropriate stack
@@ -2492,15 +2530,6 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
  if (ReturnRegHi)
    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
-  // Add the appropriate offset to SP. The call instruction takes care of
-  // resetting the stack offset during emission.
-  if (ParameterAreaSizeBytes) {
-    Operand *AddAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
-                                  Legal_Reg | Legal_Flex);
-    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
-    _add(SP, SP, AddAmount);
-  }
  // Insert a register-kill pseudo instruction.
  Context.insert(InstFakeKill::create(Func, NewCall));

--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -237,6 +237,7 @@ protected:
  void lowerSwitch(const InstSwitch *Inst) override;
  void lowerUnreachable(const InstUnreachable *Inst) override;
  void prelowerPhis() override;
+  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override;
  void genTargetHelperCallFor(Inst *Instr) override { (void)Instr; }
  void doAddressOptLoad() override;
  void doAddressOptStore() override;
@@ -289,10 +290,6 @@ protected:
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Adc::create(Func, Dest, Src0, Src1, Pred));
  }
-  void _adjust_stack(int32_t Amount, Operand *SrcAmount) {
-    Context.insert(InstARM32AdjustStack::create(
-        Func, getPhysicalRegister(RegARM32::Reg_sp), Amount, SrcAmount));
-  }
  void _and(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32And::create(Func, Dest, Src0, Src1, Pred));
@@ -813,6 +810,13 @@ protected:
    Context.insert(InstARM32Vsub::create(Func, Dest, Src0, Src1));
  }
+  // Iterates over the CFG and determines the maximum outgoing stack arguments
+  // bytes. This information is later used during addProlog() to pre-allocate
+  // the outargs area.
+  // TODO(jpp): This could live in the Parser, if we provided a Target-specific
+  // method that the Parser could call.
+  void findMaxStackOutArgsSize();
  /// Run a pass through stack variables and ensure that the offsets are legal.
  /// If the offset is not legal, use a new base register that accounts for the
  /// offset, such that the addressing mode offset bits are now legal.
@@ -820,36 +824,35 @@ protected:
  /// Returns true if the given Offset can be represented in a ldr/str.
  bool isLegalMemOffset(Type Ty, int32_t Offset) const;
  // Creates a new Base register centered around
-  // [OrigBaseReg, +/- Offset+StackAdjust].
+  // [OrigBaseReg, +/- Offset].
-  Variable *newBaseRegister(int32_t Offset, int32_t StackAdjust,
+  Variable *newBaseRegister(int32_t Offset, Variable *OrigBaseReg);
-                            Variable *OrigBaseReg);
+  /// Creates a new, legal OperandARM32Mem for accessing OrigBase + Offset. The
-  /// Creates a new, legal OperandARM32Mem for accessing OrigBase + Offset +
+  /// returned mem operand is a legal operand for accessing memory that is of
-  /// StackAdjust. The returned mem operand is a legal operand for accessing
+  /// type Ty.
-  /// memory that is of type Ty.
  ///
-  /// If [OrigBaseReg, #Offset+StackAdjust] is encodable, then the method
+  /// If [OrigBaseReg, #Offset] is encodable, then the method returns a Mem
-  /// returns a Mem operand expressing it. Otherwise,
+  /// operand expressing it. Otherwise,
  ///
-  /// if [*NewBaseReg, #Offset+StackAdjust-*NewBaseOffset] is encodable, the
+  /// if [*NewBaseReg, #Offset-*NewBaseOffset] is encodable, the method will
-  /// method will return that. Otherwise,
+  /// return that. Otherwise,
  ///
-  /// a new base register ip=OrigBaseReg+Offset+StackAdjust is created, and the
+  /// a new base register ip=OrigBaseReg+Offset is created, and the method
-  /// method returns [ip, #0].
+  /// returns [ip, #0].
  OperandARM32Mem *createMemOperand(Type Ty, int32_t Offset,
-                                    int32_t StackAdjust, Variable *OrigBaseReg,
+                                    Variable *OrigBaseReg,
                                    Variable **NewBaseReg,
                                    int32_t *NewBaseOffset);
  /// Legalizes Mov if its Source (or Destination) is a spilled Variable. Moves
  /// to memory become store instructions, and moves from memory, loads.
-  void legalizeMov(InstARM32Mov *Mov, int32_t StackAdjust,
+  void legalizeMov(InstARM32Mov *Mov, Variable *OrigBaseReg,
-                   Variable *OrigBaseReg, Variable **NewBaseReg,
+                   Variable **NewBaseReg, int32_t *NewBaseOffset);
-                   int32_t *NewBaseOffset);
  TargetARM32Features CPUFeatures;
  bool UsesFramePointer = false;
  bool NeedsStackAlignment = false;
  bool MaybeLeafFunc = true;
  size_t SpillAreaSizeBytes = 0;
+  uint32_t MaxOutArgsSizeBytes = 0;
  // TODO(jpp): std::array instead of array.
  static llvm::SmallBitVector TypeToRegisterSet[RCARM32_NUM];
  static llvm::SmallBitVector RegisterAliases[RegARM32::Reg_NUM];

--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -235,6 +235,10 @@ protected:
  void lowerSwitch(const InstSwitch *Inst) override;
  void lowerUnreachable(const InstUnreachable *Inst) override;
  void prelowerPhis() override;
+  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override {
+    (void)Instr; // Parameter is intentionally unused.
+    return 0; // Stub: always reports zero bytes of stack arguments.
+  }
  void genTargetHelperCallFor(Inst *Instr) override { (void)Instr; }
  void doAddressOptLoad() override;
  void doAddressOptStore() override;

--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -182,6 +182,10 @@ protected:
  void lowerOther(const Inst *Instr) override;
  void lowerRMW(const typename Traits::Insts::FakeRMW *RMW);
  void prelowerPhis() override;
+  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override {
+    (void)Instr; // Parameter is intentionally unused.
+    return 0; // Stub: always reports zero bytes of stack arguments.
+  }
  void genTargetHelperCallFor(Inst *Instr) override { (void)Instr; }
  void doAddressOptLoad() override;
  void doAddressOptStore() override;

--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -92,25 +92,19 @@ entry:
 ; OPTM1:      call {{.*}} R_{{.*}}    ignore64BitArgNoInline
 ; ARM32-LABEL: pass64BitArg
-; ARM32:      sub     sp, {{.*}} #16
 ; ARM32:      str     {{.*}}, [sp]
 ; ARM32:      movw    r2, #123
 ; ARM32:      bl      {{.*}} ignore64BitArgNoInline
-; ARM32:      add     sp, {{.*}} #16
-; ARM32:      sub     sp, {{.*}} #16
 ; ARM32:      str     {{.*}}, [sp]
 ; ARM32:      {{mov|ldr}} r0
 ; ARM32:      {{mov|ldr}} r1
 ; ARM32:      movw    r2, #123
 ; ARM32:      bl      {{.*}} ignore64BitArgNoInline
-; ARM32:      add     sp, {{.*}} #16
-; ARM32:      sub     sp, {{.*}} #16
 ; ARM32:      str     {{.*}}, [sp]
 ; ARM32:      {{mov|ldr}} r0
 ; ARM32:      {{mov|ldr}} r1
 ; ARM32:      movw    r2, #123
 ; ARM32:      bl      {{.*}} ignore64BitArgNoInline
-; ARM32:      add     sp, {{.*}} #16
 declare i32 @ignore64BitArgNoInline(i64, i32, i64)
@@ -144,7 +138,6 @@ entry:
 ; OPTM1:      call {{.*}} R_{{.*}}    ignore64BitArgNoInline
 ; ARM32-LABEL: pass64BitConstArg
-; ARM32:      sub     sp, {{.*}} #16
 ; ARM32:      movw    [[REG1:r.*]], {{.*}} ; 0xbeef
 ; ARM32:      movt    [[REG1]], {{.*}}     ; 0xdead
 ; ARM32:      movw    [[REG2:r.*]], {{.*}} ; 0x5678
@@ -155,7 +148,6 @@ entry:
 ; ARM32:      {{mov|ldr}} r1
 ; ARM32:      movw    r2, #123
 ; ARM32:      bl      {{.*}} ignore64BitArgNoInline
-; ARM32:      add     sp, {{.*}} #16
 define internal i32 @pass64BitUndefArg() {
 entry:

--- a/tests_lit/llvm2ice_tests/large_stack_offs.ll
+++ b/tests_lit/llvm2ice_tests/large_stack_offs.ll
@@ -49,7 +49,7 @@ end:
 ; ARM32-NOT: mov fp, sp
 ; ARM32: movw ip, #4{{.*}}
 ; ARM32-NEXT: sub sp, sp, ip
-; ARM32: movw ip, #4232
+; ARM32: movw ip, #4248
 ; ARM32-NEXT: add ip, sp, ip
 ; ARM32-NOT: movw ip
 ; %t2 is the result of the "or", and %t2 will be passed via r1 to the call.
@@ -61,14 +61,10 @@ end:
 ; ARM32: str [[REG]], [ip, #-20]
 ; ARM32: b {{[a-f0-9]+}}
 ; Now skip ahead to where the call in br_1 begins, to check how %t2 is used.
-; ARM32: movw ip, #4216
+; ARM32: movw ip, #4232
 ; ARM32-NEXT: add ip, sp, ip
-; ARM32: sub sp, sp, #16
-; Now sp1 = sp0 - 16, but ip is still in terms of sp0.
-; So, sp0 + 4212 == ip - 4.
 ; ARM32: ldr r2, [ip, #-4]
 ; ARM32: bl {{.*}} dummy
-; ARM32: add sp, sp
 ; The call clobbers ip, so we need to re-create the base register.
 ; ARM32: movw ip, #4{{.*}}
 ; ARM32: b {{[a-f0-9]+}}
@@ -122,12 +118,8 @@ end:
 ; Now skip ahead to where the call in br_1 begins, to check how %t2 is used.
 ; ARM32: movw ip, #4120
 ; ARM32-NEXT: sub ip, fp, ip
-; ARM32: sub sp, sp, #16
-; Now sp1 = sp0 - 16, but ip is still in terms of fp0.
-; So, fp0 - 4124 == ip - 4.
 ; ARM32: ldr r2, [ip, #-4]
 ; ARM32: bl {{.*}} dummy
-; ARM32: add sp, sp
 ; The call clobbers ip, so we need to re-create the base register.
 ; ARM32: movw ip, #4{{.*}}
 ; ARM32: b {{[a-f0-9]+}}