Subzero. ARM32. Combine allocas.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/1465213002 .

Subzero. ARM32. Combine allocas.
614140e2 · John Porto · fc22f770 · 614140e2 · 614140e2 · 614140e2
Commit 614140e2 authored Nov 23, 2015 by John Porto
6 changed files
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -484,8 +484,14 @@ void Cfg::sortAndCombineAllocas(CfgVector<Inst *> &Allocas,
    } else {
      // Addressing is relative to the stack pointer or to a user pointer.  Add
      // the offset before adding the size of the object, because it grows
-      // upwards from the stack pointer.
-      Offsets.push_back(CurrentOffset);
+      // upwards from the stack pointer. In addition, if the addressing is
+      // relative to the stack pointer, we need to add the pre-computed max out
+      // args size bytes.
+      const uint32_t OutArgsOffsetOrZero =
+          (BaseVariableType == BVT_StackPointer)
+              ? getTarget()->maxOutArgsSizeBytes()
+              : 0;
+      Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero);
    }
    // Update the running offset of the fused alloca region.
    CurrentOffset += Size;

--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -211,6 +211,7 @@ public:
  virtual uint32_t getStackAlignment() const = 0;
  virtual void reserveFixedAllocaArea(size_t Size, size_t Align) = 0;
  virtual int32_t getFrameFixedAllocaOffset() const = 0;
+  virtual uint32_t maxOutArgsSizeBytes() const { return 0; }

  /// Return whether a 64-bit Variable should be split into a Variable64On32.
  virtual bool shouldSplitToVariable64On32(Type Ty) const = 0;

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -265,7 +265,7 @@ uint32_t TargetARM32::getCallStackArgumentsSizeBytes(const InstCall *Call) {
 }

 void TargetARM32::findMaxStackOutArgsSize() {
-  // MinNeededOutArgsBytes should be updated if the Target ever creates an
+  // MinNeededOutArgsBytes should be updated if the Target ever creates a
  // high-level InstCall that requires more stack bytes.
  constexpr size_t MinNeededOutArgsBytes = 0;
  MaxOutArgsSizeBytes = MinNeededOutArgsBytes;
@@ -291,7 +291,7 @@ void TargetARM32::translateO2() {
  findMaxStackOutArgsSize();

  // Do not merge Alloca instructions, and lay out the stack.
-  static constexpr bool SortAndCombineAllocas = false;
+  static constexpr bool SortAndCombineAllocas = true;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

@@ -356,6 +356,7 @@ void TargetARM32::translateO2() {
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
+
  copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
  Func->dump("After linear scan regalloc");

@@ -364,6 +365,8 @@ void TargetARM32::translateO2() {
    Func->dump("After advanced Phi lowering");
  }

+  ForbidTemporaryWithoutReg _(this);
+
  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
@@ -399,8 +402,8 @@ void TargetARM32::translateOm1() {
  findMaxStackOutArgsSize();

  // Do not merge Alloca instructions, and lay out the stack.
-  static constexpr bool SortAndCombineAllocas = false;
-  Func->processAllocas(SortAndCombineAllocas);
+  static constexpr bool DontSortAndCombineAllocas = false;
+  Func->processAllocas(DontSortAndCombineAllocas);
  Func->dump("After Alloca processing");

  Func->placePhiLoads();
@@ -424,9 +427,12 @@ void TargetARM32::translateOm1() {
  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
+
  copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
  Func->dump("After regalloc of infinite-weight variables");

+  ForbidTemporaryWithoutReg _(this);
+
  Func->genFrame();
  if (Func->hasError())
    return;
@@ -520,6 +526,7 @@ void TargetARM32::emitVariable(const Variable *Var) const {
    llvm::report_fatal_error(
        "Infinite-weight Variable has no register assigned");
  }
+  assert(!Var->isRematerializable());
  int32_t Offset = Var->getStackOffset();
  int32_t BaseRegNum = Var->getBaseRegNum();
  if (BaseRegNum == Variable::NoRegister) {
@@ -850,6 +857,9 @@ void TargetARM32::addProlog(CfgNode *Node) {
    SpillAreaSizeBytes = StackSize - StackOffset;
  }

+  // Combine fixed alloca with SpillAreaSize.
+  SpillAreaSizeBytes += FixedAllocaSizeBytes;
+
  // Generate "sub sp, SpillAreaSizeBytes"
  if (SpillAreaSizeBytes) {
    // Use the scratch register if needed to legalize the immediate.
@@ -857,7 +867,11 @@ void TargetARM32::addProlog(CfgNode *Node) {
                                  Legal_Reg | Legal_Flex, getReservedTmpReg());
    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
    _sub(SP, SP, SubAmount);
+    if (FixedAllocaAlignBytes > ARM32_STACK_ALIGNMENT_BYTES) {
+      alignRegisterPow2(SP, FixedAllocaAlignBytes);
+    }
  }
+
  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

  // Fill in stack offsets for stack args, and copy args into registers for
@@ -1034,6 +1048,7 @@ OperandARM32Mem *TargetARM32::createMemOperand(Type Ty, int32_t Offset,
                                               Variable *OrigBaseReg,
                                               Variable **NewBaseReg,
                                               int32_t *NewBaseOffset) {
+  assert(!OrigBaseReg->isRematerializable());
  if (isLegalMemOffset(Ty, Offset)) {
    return OperandARM32Mem::create(
        Func, Ty, OrigBaseReg,
@@ -1053,6 +1068,7 @@ OperandARM32Mem *TargetARM32::createMemOperand(Type Ty, int32_t Offset,
    OffsetDiff = 0;
  }

+  assert(!(*NewBaseReg)->isRematerializable());
  return OperandARM32Mem::create(
      Func, Ty, *NewBaseReg,
      llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetDiff)),
@@ -1076,8 +1092,9 @@ void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, Variable *OrigBaseReg,

  bool Legalized = false;
  if (!Dest->hasReg()) {
-    auto *const SrcR = llvm::cast<Variable>(Src);
+    auto *SrcR = llvm::cast<Variable>(Src);
    assert(SrcR->hasReg());
+    assert(!SrcR->isRematerializable());
    const int32_t Offset = Dest->getStackOffset();
    // This is a _mov(Mem(), Variable), i.e., a store.
    _str(SrcR, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
@@ -1087,12 +1104,26 @@ void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, Variable *OrigBaseReg,
    Context.insert(InstFakeDef::create(Func, Dest));
    Legalized = true;
  } else if (auto *Var = llvm::dyn_cast<Variable>(Src)) {
-    if (!Var->hasReg()) {
-      const int32_t Offset = Var->getStackOffset();
-      _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
-                                  NewBaseOffset),
-           MovInstr->getPredicate());
+    if (Var->isRematerializable()) {
+      // Rematerialization arithmetic.
+      const int32_t ExtraOffset =
+          (static_cast<SizeT>(Var->getRegNum()) == getFrameReg())
+              ? getFrameFixedAllocaOffset()
+              : 0;
+
+      const int32_t Offset = Var->getStackOffset() + ExtraOffset;
+      Operand *OffsetRF = legalize(Ctx->getConstantInt32(Offset),
+                                   Legal_Reg | Legal_Flex, Dest->getRegNum());
+      _add(Dest, Var, OffsetRF);
      Legalized = true;
+    } else {
+      if (!Var->hasReg()) {
+        const int32_t Offset = Var->getStackOffset();
+        _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
+                                    NewBaseOffset),
+             MovInstr->getPredicate());
+        Legalized = true;
+      }
    }
  }

@@ -1163,13 +1194,15 @@ Operand *TargetARM32::loOperand(Operand *Operand) {
    // increment) in case of duplication.
    assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
           Mem->getAddrMode() == OperandARM32Mem::NegOffset);
+    Variable *BaseR = legalizeToReg(Mem->getBase());
    if (Mem->isRegReg()) {
-      return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
-                                     Mem->getIndex(), Mem->getShiftOp(),
-                                     Mem->getShiftAmt(), Mem->getAddrMode());
+      Variable *IndexR = legalizeToReg(Mem->getIndex());
+      return OperandARM32Mem::create(Func, IceType_i32, BaseR, IndexR,
+                                     Mem->getShiftOp(), Mem->getShiftAmt(),
+                                     Mem->getAddrMode());
    } else {
-      return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
-                                     Mem->getOffset(), Mem->getAddrMode());
+      return OperandARM32Mem::create(Func, IceType_i32, BaseR, Mem->getOffset(),
+                                     Mem->getAddrMode());
    }
  }
  llvm_unreachable("Unsupported operand type");
@@ -1201,7 +1234,9 @@ Operand *TargetARM32::hiOperand(Operand *Operand) {
      Variable *NewBase = Func->makeVariable(Base->getType());
      lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase,
                                             Base, Four));
-      return OperandARM32Mem::create(Func, SplitType, NewBase, Mem->getIndex(),
+      Variable *BaseR = legalizeToReg(NewBase);
+      Variable *IndexR = legalizeToReg(Mem->getIndex());
+      return OperandARM32Mem::create(Func, SplitType, BaseR, IndexR,
                                     Mem->getShiftOp(), Mem->getShiftAmt(),
                                     Mem->getAddrMode());
    } else {
@@ -1216,16 +1251,17 @@ Operand *TargetARM32::hiOperand(Operand *Operand) {
        // mode into a RegReg addressing mode. Since NaCl sandboxing disallows
        // RegReg addressing modes, prefer adding to base and replacing
        // instead. Thus we leave the old offset alone.
-        Constant *Four = Ctx->getConstantInt32(4);
+        Constant *_4 = Ctx->getConstantInt32(4);
        Variable *NewBase = Func->makeVariable(Base->getType());
        lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,
-                                               NewBase, Base, Four));
+                                               NewBase, Base, _4));
        Base = NewBase;
      } else {
        Offset =
            llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(NextOffsetVal));
      }
-      return OperandARM32Mem::create(Func, SplitType, Base, Offset,
+      Variable *BaseR = legalizeToReg(Base);
+      return OperandARM32Mem::create(Func, SplitType, BaseR, Offset,
                                     Mem->getAddrMode());
    }
  }
@@ -1264,7 +1300,6 @@ llvm::SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,
 }

 void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
-  UsesFramePointer = true;
  // Conservatively require the stack to be aligned. Some stack adjustment
  // operations implemented below assume that the stack is aligned before the
  // alloca. All the alloca code ensures that the stack alignment is preserved
@@ -1272,29 +1307,53 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
  // cases.
  NeedsStackAlignment = true;

-  // TODO(stichnot): minimize the number of adjustments of SP, etc.
-  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
-  Variable *Dest = Inst->getDest();
-  uint32_t AlignmentParam = Inst->getAlignInBytes();
  // For default align=0, set it to the real value 1, to avoid any
  // bit-manipulation problems below.
-  AlignmentParam = std::max(AlignmentParam, 1u);
+  const uint32_t AlignmentParam = std::max(1u, Inst->getAlignInBytes());

  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(AlignmentParam));
  assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));

-  uint32_t Alignment = std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
-  if (Alignment > ARM32_STACK_ALIGNMENT_BYTES) {
+  const uint32_t Alignment =
+      std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
+  const bool OverAligned = Alignment > ARM32_STACK_ALIGNMENT_BYTES;
+  const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1;
+  const bool AllocaWithKnownOffset = Inst->getKnownFrameOffset();
+  const bool UseFramePointer =
+      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
+
+  if (UseFramePointer)
+    setHasFramePointer();
+
+  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+  if (OverAligned) {
    alignRegisterPow2(SP, Alignment);
  }
+
+  Variable *Dest = Inst->getDest();
  Operand *TotalSize = Inst->getSizeInBytes();
+
  if (const auto *ConstantTotalSize =
          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
-    uint32_t Value = ConstantTotalSize->getValue();
-    Value = Utils::applyAlignment(Value, Alignment);
-    Operand *SubAmount = legalize(Ctx->getConstantInt32(Value));
-    _sub(SP, SP, SubAmount);
+    const uint32_t Value =
+        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
+    // Constant size alloca.
+    if (!UseFramePointer) {
+      // If we don't need a Frame Pointer, this alloca has a known offset to the
+      // stack pointer. We don't need to adjust the stack pointer, nor assign
+      // any value to Dest, as Dest is rematerializable.
+      assert(Dest->isRematerializable());
+      FixedAllocaSizeBytes += Value;
+      Context.insert(InstFakeDef::create(Func, Dest));
+      return;
+    }
+
+    // If a frame pointer is required, then we need to store the alloca'd result
+    // in Dest.
+    Operand *SubAmountRF =
+        legalize(Ctx->getConstantInt32(Value), Legal_Reg | Legal_Flex);
+    _sub(SP, SP, SubAmountRF);
  } else {
    // Non-constant sizes need to be adjusted to the next highest multiple of
    // the required alignment at runtime.
@@ -1306,6 +1365,8 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
    alignRegisterPow2(T, Alignment);
    _sub(SP, SP, T);
  }
+
+  // Adds back a few bytes to SP to account for the out args area.
  Variable *T = SP;
  if (MaxOutArgsSizeBytes != 0) {
    T = makeReg(getPointerType());
@@ -1313,6 +1374,7 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
        Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex);
    _add(T, SP, OutArgsSizeRF);
  }
+
  _mov(Dest, T);
 }

@@ -1976,6 +2038,12 @@ void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op,

 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
  Variable *Dest = Inst->getDest();
+
+  if (Dest->isRematerializable()) {
+    Context.insert(InstFakeDef::create(Func, Dest));
+    return;
+  }
+
  if (Dest->getType() == IceType_i1) {
    lowerInt1Arithmetic(Inst);
    return;
@@ -2139,8 +2207,8 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
    if (Srcs.hasConstOperand()) {
      // TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed
      // to be used.
-      Variable *Src0R = Srcs.src0R(this);
      if (Srcs.immediateIsFlexEncodable()) {
+        Variable *Src0R = Srcs.src0R(this);
        Operand *Src1RF = Srcs.src1RF(this);
        if (Srcs.swappedOperands()) {
          _rsb(T, Src0R, Src1RF);
@@ -2151,6 +2219,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
        return;
      }
      if (!Srcs.swappedOperands() && Srcs.negatedImmediateIsFlexEncodable()) {
+        Variable *Src0R = Srcs.src0R(this);
        Operand *Src1F = Srcs.negatedSrc1F(this);
        _add(T, Src0R, Src1F);
        _mov(Dest, T);
@@ -2215,6 +2284,12 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

 void TargetARM32::lowerAssign(const InstAssign *Inst) {
  Variable *Dest = Inst->getDest();
+
+  if (Dest->isRematerializable()) {
+    Context.insert(InstFakeDef::create(Func, Dest));
+    return;
+  }
+
  Operand *Src0 = Inst->getSrc(0);
  assert(Dest->getType() == Src0->getType());
  if (Dest->getType() == IceType_i64) {
@@ -4425,13 +4500,17 @@ OperandARM32Mem *TargetARM32::formAddressingMode(Type Ty, Cfg *Func,
  assert(OffsetImm < 0 ? (ValidImmMask & -OffsetImm) == -OffsetImm
                       : (ValidImmMask & OffsetImm) == OffsetImm);

+  Variable *BaseR = makeReg(getPointerType());
+  Context.insert(InstAssign::create(Func, BaseR, BaseVar));
  if (OffsetReg != nullptr) {
-    return OperandARM32Mem::create(Func, Ty, BaseVar, OffsetReg, ShiftKind,
+    Variable *OffsetR = makeReg(getPointerType());
+    Context.insert(InstAssign::create(Func, OffsetR, OffsetReg));
+    return OperandARM32Mem::create(Func, Ty, BaseR, OffsetR, ShiftKind,
                                   OffsetRegShamt);
  }

  return OperandARM32Mem::create(
-      Func, Ty, BaseVar,
+      Func, Ty, BaseR,
      llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetImm)));
 }

@@ -4630,7 +4709,8 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
  if (RegNum == Variable::NoRegister) {
    if (Variable *Subst = getContext().availabilityGet(From)) {
      // At this point we know there is a potential substitution available.
-      if (Subst->mustHaveReg() && !Subst->hasReg()) {
+      if (!Subst->isRematerializable() && Subst->mustHaveReg() &&
+          !Subst->hasReg()) {
        // At this point we know the substitution will have a register.
        if (From->getType() == Subst->getType()) {
          // At this point we know the substitution's register is compatible.
@@ -4788,6 +4868,13 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
  }

  if (auto *Var = llvm::dyn_cast<Variable>(From)) {
+    if (Var->isRematerializable()) {
+      // TODO(jpp): We don't need to rematerialize Var if legalize() was invoked
+      // for a Variable in a Mem operand.
+      Variable *T = makeReg(Var->getType(), RegNum);
+      _mov(T, Var);
+      return T;
+    }
    // Check if the variable is guaranteed a physical register. This can happen
    // either when the variable is pre-colored or when it is assigned infinite
    // weight.
@@ -4844,9 +4931,9 @@ OperandARM32Mem *TargetARM32::formMemoryOperand(Operand *Operand, Type Ty) {
  // If we didn't do address mode optimization, then we only have a
  // base/offset to work with. ARM always requires a base register, so
  // just use that to hold the operand.
-  Variable *Base = legalizeToReg(Operand);
+  Variable *BaseR = legalizeToReg(Operand);
  return OperandARM32Mem::create(
-      Func, Ty, Base,
+      Func, Ty, BaseR,
      llvm::cast<ConstantInteger32>(Ctx->getConstantZero(IceType_i32)));
 }

@@ -4863,6 +4950,7 @@ Variable64On32 *TargetARM32::makeI64RegPair() {
 Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {
  // There aren't any 64-bit integer registers for ARM32.
  assert(Type != IceType_i64);
+  assert(AllowTemporaryWithNoReg || RegNum != Variable::NoRegister);
  Variable *Reg = Func->makeVariable(Type);
  if (RegNum == Variable::NoRegister)
    Reg->setMustHaveReg();
@@ -4871,7 +4959,8 @@ Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {
  return Reg;
 }

-void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
+void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align,
+                                    int32_t TmpRegNum) {
  assert(llvm::isPowerOf2_32(Align));
  uint32_t RotateAmt;
  uint32_t Immed_8;
@@ -4880,10 +4969,12 @@ void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
  // it fits at all). Assume Align is usually small, in which case BIC works
  // better. Thus, this rounds down to the alignment.
  if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
-    Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);
+    Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex,
+                    TmpRegNum);
    _bic(Reg, Reg, Mask);
  } else {
-    Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex);
+    Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex,
+                    TmpRegNum);
    _and(Reg, Reg, Mask);
  }
 }

--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -99,16 +99,15 @@ public:
  }
  uint32_t getStackAlignment() const override;
  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
-    // TODO(sehr,jpp): Implement fixed stack layout.
-    (void)Size;
-    (void)Align;
-    llvm::report_fatal_error("Not yet implemented");
+    FixedAllocaSizeBytes = Size;
+    assert(llvm::isPowerOf2_32(Align));
+    FixedAllocaAlignBytes = Align;
+    PrologEmitsFixedAllocas = true;
  }
  int32_t getFrameFixedAllocaOffset() const override {
-    // TODO(sehr,jpp): Implement fixed stack layout.
-    llvm::report_fatal_error("Not yet implemented");
-    return 0;
+    return FixedAllocaSizeBytes - (SpillAreaSizeBytes - MaxOutArgsSizeBytes);
  }
+  uint32_t maxOutArgsSizeBytes() const override { return MaxOutArgsSizeBytes; }

  bool shouldSplitToVariable64On32(Type Ty) const override {
    return Ty == IceType_i64;
@@ -250,7 +249,8 @@ protected:
  Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
  static Type stackSlotType();
  Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
-  void alignRegisterPow2(Variable *Reg, uint32_t Align);
+  void alignRegisterPow2(Variable *Reg, uint32_t Align,
+                         int32_t TmpRegNum = Variable::NoRegister);

  /// Returns a vector in a register with the given constant entries.
  Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
@@ -811,7 +811,7 @@ protected:
  }

  // Iterates over the CFG and determines the maximum outgoing stack arguments
-  // bytes. This information is later used during addProlog() do pre-allocate
+  // bytes. This information is later used during addProlog() to pre-allocate
  // the outargs area.
  // TODO(jpp): This could live in the Parser, if we provided a Target-specific
  // method that the Parser could call.
@@ -852,6 +852,9 @@ protected:
  bool NeedsStackAlignment = false;
  bool MaybeLeafFunc = true;
  size_t SpillAreaSizeBytes = 0;
+  size_t FixedAllocaSizeBytes = 0;
+  size_t FixedAllocaAlignBytes = 0;
+  bool PrologEmitsFixedAllocas = false;
  uint32_t MaxOutArgsSizeBytes = 0;
  // TODO(jpp): std::array instead of array.
  static llvm::SmallBitVector TypeToRegisterSet[RCARM32_NUM];
@@ -970,6 +973,29 @@ private:
  };

  BoolComputationTracker BoolComputations;
+
+  // AllowTemporaryWithNoReg indicates if TargetARM32::makeReg() can be invoked
+  // without specifying a physical register. This is needed for creating unbound
+  // temporaries during Ice -> ARM lowering, but before register allocation.
+  // This is a safeguard ensuring that no unbound temporaries are created during
+  // the legalization post-passes.
+  bool AllowTemporaryWithNoReg = true;
+  // ForbidTemporaryWithoutReg is a RAII class that manages
+  // AllowTemporaryWithNoReg.
+  class ForbidTemporaryWithoutReg {
+    ForbidTemporaryWithoutReg() = delete;
+    ForbidTemporaryWithoutReg(const ForbidTemporaryWithoutReg&) = delete;
+    ForbidTemporaryWithoutReg &operator=(const ForbidTemporaryWithoutReg&) = delete;
+
+  public:
+    explicit ForbidTemporaryWithoutReg(TargetARM32 *Target) : Target(Target) {
+      Target->AllowTemporaryWithNoReg = false;
+    }
+    ~ForbidTemporaryWithoutReg() { Target->AllowTemporaryWithNoReg = true; }
+
+  private:
+    TargetARM32 *const Target;
+  };
 };

 class TargetDataARM32 final : public TargetDataLowering {

--- a/tests_lit/assembler/arm32/bic.ll
+++ b/tests_lit/assembler/arm32/bic.ll
@@ -28,70 +28,65 @@ define internal i32 @AllocBigAlign() {

 ; ASM-LABEL:AllocBigAlign:
 ; ASM-NEXT:.LAllocBigAlign$__0:
-; ASM-NEXT:  push    {fp}
-; ASM-NEXT:  mov     fp, sp
-; ASM-NEXT:  sub     sp, sp, #12
-; ASM-NEXT:  bic     sp, sp, #31
-; ASM-NEXT:  sub     sp, sp, #32
-; ASM-NEXT:  mov     r0, sp
-; ASM-NEXT:  mov     sp, fp
-; ASM-NEXT:  pop     {fp}
-; ASM-NEXT:  # fp = def.pseudo 
-; ASM-NEXT:  bx      lr
-
 ; DIS-LABEL:00000000 <AllocBigAlign>:
-; DIS-NEXT:   0:        e52db004
-; DIS-NEXT:   4:        e1a0b00d
-; DIS-NEXT:   8:        e24dd00c
-; DIS-NEXT:   c:        e3cdd01f
-; DIS-NEXT:  10:        e24dd020
-; DIS-NEXT:  14:        e1a0000d
-; DIS-NEXT:  18:        e1a0d00b
-; DIS-NEXT:  1c:        e49db004
-; DIS-NEXT:  20:        e12fff1e
-
 ; IASM-LABEL:AllocBigAlign:
 ; IASM-NEXT:.LAllocBigAlign$__0:
+
+; ASM-NEXT:  push    {fp}
+; DIS-NEXT:   0:        e52db004
 ; IASM-NEXT:    .byte 0x4
 ; IASM-NEXT:    .byte 0xb0
 ; IASM-NEXT:    .byte 0x2d
 ; IASM-NEXT:    .byte 0xe5

+; ASM-NEXT:  mov     fp, sp
+; DIS-NEXT:   4:        e1a0b00d
 ; IASM:         .byte 0xd
 ; IASM-NEXT:    .byte 0xb0
 ; IASM-NEXT:    .byte 0xa0
 ; IASM-NEXT:    .byte 0xe1

-; IASM:         .byte 0xc
+; ASM-NEXT:  sub     sp, sp, #32
+; DIS-NEXT:   8:        e24dd020
+; IASM:         .byte 0x20
 ; IASM-NEXT:    .byte 0xd0
 ; IASM-NEXT:    .byte 0x4d
 ; IASM-NEXT:    .byte 0xe2

+; ASM-NEXT:  bic     sp, sp, #31
+; DIS-NEXT:   c:        e3cdd01f
 ; IASM:         .byte 0x1f
 ; IASM-NEXT:    .byte 0xd0
 ; IASM-NEXT:    .byte 0xcd
 ; IASM-NEXT:    .byte 0xe3

-; IASM:         .byte 0x20
-; IASM-NEXT:    .byte 0xd0
-; IASM-NEXT:    .byte 0x4d
-; IASM-NEXT:    .byte 0xe2
+; ASM-NEXT:  # sp = def.pseudo 

-; IASM:         .byte 0xd
+; ASM-NEXT:  add     r0, sp, #0
+; DIS-NEXT:  10:        e28d0000
+; IASM:         .byte 0x0
 ; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xa0
-; IASM-NEXT:    .byte 0xe1
+; IASM-NEXT:    .byte 0x8d
+; IASM-NEXT:    .byte 0xe2

+; ASM-NEXT:  mov     sp, fp
+; DIS-NEXT:  14:        e1a0d00b
 ; IASM:         .byte 0xb
 ; IASM-NEXT:    .byte 0xd0
 ; IASM-NEXT:    .byte 0xa0
 ; IASM-NEXT:    .byte 0xe1

+; ASM-NEXT:  pop     {fp}
+; DIS-NEXT:  18:        e49db004
 ; IASM-NEXT:    .byte 0x4
 ; IASM-NEXT:    .byte 0xb0
 ; IASM-NEXT:    .byte 0x9d
 ; IASM-NEXT:    .byte 0xe4

+; ASM-NEXT:  # fp = def.pseudo 
+
+; ASM-NEXT:  bx      lr
+; DIS-NEXT:  1c:        e12fff1e
 ; IASM:         .byte 0x1e
 ; IASM-NEXT:    .byte 0xff
 ; IASM-NEXT:    .byte 0x2f

--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
@@ -17,14 +17,14 @@
 ; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
 ; RUN:   -allow-externally-defined-symbols \
 ; RUN:   | %if --need=target_ARM32 --need=allow_dump \
-; RUN:   --command FileCheck --check-prefix ARM32 %s
+; RUN:   --command FileCheck --check-prefix ARM32 --check-prefix=ARM-OPT2 %s

 ; RUN: %if --need=target_ARM32 --need=allow_dump \
 ; RUN:   --command %p2i --filetype=asm --assemble \
 ; RUN:   --disassemble --target arm32 -i %s --args -Om1 --skip-unimplemented \
 ; RUN:   -allow-externally-defined-symbols \
 ; RUN:   | %if --need=target_ARM32 --need=allow_dump \
-; RUN:   --command FileCheck --check-prefix ARM32 %s
+; RUN:   --command FileCheck --check-prefix ARM32 --check-prefix=ARM-OPTM1 %s

 define internal void @fixed_416_align_16(i32 %n) {
 entry:
@@ -47,8 +47,9 @@ entry:
 ; CHECK-OPTM1:      call {{.*}} R_{{.*}}    f1

 ; ARM32-LABEL: fixed_416_align_16
-; ARM32:      sub sp, sp, #416
-; ARM32:      bl {{.*}} R_{{.*}}    f1
+; ARM32-OPT2:  sub sp, sp, #428
+; ARM32-OPTM1: sub sp, sp, #416
+; ARM32:       bl {{.*}} R_{{.*}}    f1

 define internal void @fixed_416_align_32(i32 %n) {
 entry:
@@ -67,9 +68,10 @@ entry:
 ; CHECK:      call {{.*}} R_{{.*}}    f1

 ; ARM32-LABEL: fixed_416_align_32
-; ARM32:      bic sp, sp, #31
-; ARM32:      sub sp, sp, #416
-; ARM32:      bl {{.*}} R_{{.*}}    f1
+; ARM32-OPT2:  sub sp, sp, #424
+; ARM32-OPTM1: sub sp, sp, #416
+; ARM32:       bic sp, sp, #31
+; ARM32:       bl {{.*}} R_{{.*}}    f1

 ; Show that the amount to allocate will be rounded up.
 define internal void @fixed_351_align_16(i32 %n) {
@@ -91,8 +93,9 @@ entry:
 ; CHECK-OPTM1:      call {{.*}} R_{{.*}}    f1

 ; ARM32-LABEL: fixed_351_align_16
-; ARM32:      sub sp, sp, #352
-; ARM32:      bl {{.*}} R_{{.*}}    f1
+; ARM32-OPT2:  sub sp, sp, #364
+; ARM32-OPTM1: sub sp, sp, #352
+; ARM32:       bl {{.*}} R_{{.*}}    f1

 define internal void @fixed_351_align_32(i32 %n) {
 entry:
@@ -111,9 +114,10 @@ entry:
 ; CHECK:      call {{.*}} R_{{.*}}    f1

 ; ARM32-LABEL: fixed_351_align_32
-; ARM32:      bic sp, sp, #31
-; ARM32:      sub sp, sp, #352
-; ARM32:      bl {{.*}} R_{{.*}}    f1
+; ARM32-OPT2:  sub sp, sp, #360
+; ARM32-OPTM1: sub sp, sp, #352
+; ARM32:       bic sp, sp, #31
+; ARM32:       bl {{.*}} R_{{.*}}    f1

 declare void @f1(i32 %ignored)