Subzero ARM: addProlog/addEpilogue -- share some code with x86.

Split out some of the addProlog code from x86 and reuse that for ARM. Mainly, the code that doesn't concern preserved registers or stack arguments is split out. ARM push and pop take a whole list of registers (not necessarily consecutive, but they should be in ascending order). There is also "vpush" for callee-saved float/vector registers but we do not handle that yet (the register numbers for that have to be consecutive). Enable some of the int-arg.ll tests, which relied on addProlog's finishArgumentLowering to pull from the correct argument stack slot. Test some of the frame pointer usage (push/pop) when handling a variable sized alloca. Also change the classification of LR and PC so that they are not "CalleeSave". We don't want to push LR if it isn't overwritten by another call. It will certainly be "used" by the return, however. The prologue code only checks if a CalleeSave register is used somewhere before deciding to preserve it. We could make that stricter and check if the register is also written to, but there are some additional writes that are not visible until after the push/pop are generated (e.g., copy from argument stack slot to the argument register). Instead, keep checking use only, and handle LR as a special case (IsLeafFunction). BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1159013002

Subzero ARM: addProlog/addEpilogue -- share some code with x86.
0fa6c5a0 · Jan Voung · c77f817f · 0fa6c5a0 · 0fa6c5a0 · 0fa6c5a0
Commit 0fa6c5a0 authored Jun 01, 2015 by Jan Voung
12 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -260,6 +260,20 @@ InstARM32Mla::InstARM32Mla(Cfg *Func, Variable *Dest, Variable *Src0,
  addSource(Acc);
 }

+InstARM32Pop::InstARM32Pop(Cfg *Func, const VarList &Dests)
+    : InstARM32(Func, InstARM32::Pop, 0, nullptr), Dests(Dests) {
+  // Track modifications to Dests separately via FakeDefs.
+  // Also, a pop instruction affects the stack pointer and so it should not
+  // be allowed to be automatically dead-code eliminated. This is automatic
+  // since we leave the Dest as nullptr.
+}
+
+InstARM32Push::InstARM32Push(Cfg *Func, const VarList &Srcs)
+    : InstARM32(Func, InstARM32::Push, Srcs.size(), nullptr) {
+  for (Variable *Source : Srcs)
+    addSource(Source);
+}
+
 InstARM32Ret::InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source)
    : InstARM32(Func, InstARM32::Ret, Source ? 2 : 1, nullptr) {
  addSource(LR);
@@ -554,6 +568,66 @@ template <> void InstARM32Movt::emit(const Cfg *Func) const {
  }
 }

+void InstARM32Pop::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  assert(Dests.size() > 0);
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+      << "pop"
+      << "\t{";
+  for (SizeT I = 0; I < Dests.size(); ++I) {
+    if (I > 0)
+      Str << ", ";
+    Dests[I]->emit(Func);
+  }
+  Str << "}";
+}
+
+void InstARM32Pop::emitIAS(const Cfg *Func) const {
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Pop::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "pop"
+      << " ";
+  for (SizeT I = 0; I < Dests.size(); ++I) {
+    if (I > 0)
+      Str << ", ";
+    Dests[I]->dump(Func);
+  }
+}
+
+void InstARM32Push::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  assert(getSrcSize() > 0);
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+      << "push"
+      << "\t{";
+  emitSources(Func);
+  Str << "}";
+}
+
+void InstARM32Push::emitIAS(const Cfg *Func) const {
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Push::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "push"
+      << " ";
+  dumpSources(Func);
+}
+
 void InstARM32Ret::emit(const Cfg *Func) const {
  if (!ALLOW_DUMP)
    return;
@@ -683,7 +757,7 @@ void OperandARM32Mem::dump(const Cfg *Func, Ostream &Str) const {
  } else {
    getOffset()->dump(Func, Str);
  }
-  Str << "] AddrMode==" << getAddrMode() << "\n";
+  Str << "] AddrMode==" << getAddrMode();
 }

 void OperandARM32FlexImm::emit(const Cfg *Func) const {

--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -15,9 +15,14 @@
 #define SUBZERO_SRC_ICEINSTARM32_DEF

 // NOTE: PC and SP are not considered isInt, to avoid register allocating.
+//
 // For the NaCl sandbox we also need to r9 for TLS, so just reserve always.
 // TODO(jvoung): Allow r9 to be isInt when sandboxing is turned off
 // (native mode).
+//
+// LR is not considered isInt to avoid being allocated as a register.
+// It is technically preserved, but save/restore is handled separately,
+// based on whether or not the function may be a leaf (MaybeLeafFunc).
 #define REGARM32_GPR_TABLE                                                     \
  /* val, encode, name, scratch, preserved, stackptr, frameptr, isInt, isFP */ \
  X(Reg_r0,  = 0,            "r0",  1, 0, 0, 0, 1, 0)                   \
@@ -33,9 +38,9 @@
  X(Reg_r10, = Reg_r0 + 10,  "r10", 0, 1, 0, 0, 1, 0)                   \
  X(Reg_fp,  = Reg_r0 + 11,  "fp",  0, 1, 0, 1, 1, 0)                   \
  X(Reg_ip,  = Reg_r0 + 12,  "ip",  1, 0, 0, 0, 1, 0)                   \
-  X(Reg_sp,  = Reg_r0 + 13,  "sp",  0, 1, 1, 0, 0, 0)                   \
-  X(Reg_lr,  = Reg_r0 + 14,  "lr",  0, 1, 0, 0, 1, 0)                   \
-  X(Reg_pc,  = Reg_r0 + 15,  "pc",  0, 1, 0, 0, 0, 0)                   \
+  X(Reg_sp,  = Reg_r0 + 13,  "sp",  0, 0, 1, 0, 0, 0)                   \
+  X(Reg_lr,  = Reg_r0 + 14,  "lr",  0, 0, 0, 0, 0, 0)                   \
+  X(Reg_pc,  = Reg_r0 + 15,  "pc",  0, 0, 0, 0, 0, 0)                   \
 //#define X(val, encode, name, scratch, preserved, stackptr, frameptr,
 //          isInt, isFP)


--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -113,7 +113,14 @@ public:
  AddrMode getAddrMode() const { return Mode; }

  bool isRegReg() const { return Index != nullptr; }
-  bool isNegAddrMode() const { return Mode >= NegOffset; }
+  bool isNegAddrMode() const {
+    // Positive address modes have the "U" bit set, and negative modes don't.
+    static_assert((PreIndex & (4 << 21)) != 0,
+                  "Positive addr modes should have U bit set.");
+    static_assert((NegPreIndex & (4 << 21)) == 0,
+                  "Negative addr modes should have U bit clear.");
+    return (Mode & (4 << 21)) == 0;
+  }

  void emit(const Cfg *Func) const override;
  using OperandARM32::dump;
@@ -266,6 +273,8 @@ public:
    Mul,
    Mvn,
    Orr,
+    Pop,
+    Push,
    Ret,
    Sbc,
    Sub,
@@ -682,6 +691,49 @@ private:
  ~InstARM32Mla() override {}
 };

+// Pop into a list of GPRs. Technically this can be predicated, but we don't
+// need that functionality.
+class InstARM32Pop : public InstARM32 {
+  InstARM32Pop() = delete;
+  InstARM32Pop(const InstARM32Pop &) = delete;
+  InstARM32Pop &operator=(const InstARM32Pop &) = delete;
+
+public:
+  static InstARM32Pop *create(Cfg *Func, const VarList &Dests) {
+    return new (Func->allocate<InstARM32Pop>()) InstARM32Pop(Func, Dests);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Pop); }
+
+private:
+  InstARM32Pop(Cfg *Func, const VarList &Dests);
+  ~InstARM32Pop() override {}
+  VarList Dests;
+};
+
+// Push a list of GPRs. Technically this can be predicated, but we don't
+// need that functionality.
+class InstARM32Push : public InstARM32 {
+  InstARM32Push() = delete;
+  InstARM32Push(const InstARM32Push &) = delete;
+  InstARM32Push &operator=(const InstARM32Push &) = delete;
+
+public:
+  static InstARM32Push *create(Cfg *Func, const VarList &Srcs) {
+    return new (Func->allocate<InstARM32Push>()) InstARM32Push(Func, Srcs);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Push); }
+
+private:
+  InstARM32Push(Cfg *Func, const VarList &Srcs);
+  ~InstARM32Push() override {}
+};
+
 // Ret pseudo-instruction.  This is actually a "bx" instruction with
 // an "lr" register operand, but epilogue lowering will search for a Ret
 // instead of a generic "bx". This instruction also takes a Source

--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -244,6 +244,159 @@ void TargetLowering::inferTwoAddress() {
  }
 }

+void TargetLowering::sortVarsByAlignment(VarList &Dest,
+                                         const VarList &Source) const {
+  Dest = Source;
+  // Instead of std::sort, we could do a bucket sort with log2(alignment)
+  // as the buckets, if performance is an issue.
+  std::sort(Dest.begin(), Dest.end(),
+            [this](const Variable *V1, const Variable *V2) {
+              return typeWidthInBytesOnStack(V1->getType()) >
+                  typeWidthInBytesOnStack(V2->getType());
+            });
+}
+
+void TargetLowering::getVarStackSlotParams(
+    VarList &SortedSpilledVariables, llvm::SmallBitVector &RegsUsed,
+    size_t *GlobalsSize, size_t *SpillAreaSizeBytes,
+    uint32_t *SpillAreaAlignmentBytes, uint32_t *LocalsSlotsAlignmentBytes,
+    std::function<bool(Variable *)> TargetVarHook) {
+  const VariablesMetadata *VMetadata = Func->getVMetadata();
+  llvm::BitVector IsVarReferenced(Func->getNumVariables());
+  for (CfgNode *Node : Func->getNodes()) {
+    for (Inst &Inst : Node->getInsts()) {
+      if (Inst.isDeleted())
+        continue;
+      if (const Variable *Var = Inst.getDest())
+        IsVarReferenced[Var->getIndex()] = true;
+      for (SizeT I = 0; I < Inst.getSrcSize(); ++I) {
+        Operand *Src = Inst.getSrc(I);
+        SizeT NumVars = Src->getNumVars();
+        for (SizeT J = 0; J < NumVars; ++J) {
+          const Variable *Var = Src->getVar(J);
+          IsVarReferenced[Var->getIndex()] = true;
+        }
+      }
+    }
+  }
+
+  // If SimpleCoalescing is false, each variable without a register
+  // gets its own unique stack slot, which leads to large stack
+  // frames.  If SimpleCoalescing is true, then each "global" variable
+  // without a register gets its own slot, but "local" variable slots
+  // are reused across basic blocks.  E.g., if A and B are local to
+  // block 1 and C is local to block 2, then C may share a slot with A or B.
+  //
+  // We cannot coalesce stack slots if this function calls a "returns twice"
+  // function. In that case, basic blocks may be revisited, and variables
+  // local to those basic blocks are actually live until after the
+  // called function returns a second time.
+  const bool SimpleCoalescing = !callsReturnsTwice();
+
+  std::vector<size_t> LocalsSize(Func->getNumNodes());
+  const VarList &Variables = Func->getVariables();
+  VarList SpilledVariables;
+  for (Variable *Var : Variables) {
+    if (Var->hasReg()) {
+      RegsUsed[Var->getRegNum()] = true;
+      continue;
+    }
+    // An argument either does not need a stack slot (if passed in a
+    // register) or already has one (if passed on the stack).
+    if (Var->getIsArg())
+      continue;
+    // An unreferenced variable doesn't need a stack slot.
+    if (!IsVarReferenced[Var->getIndex()])
+      continue;
+    // Check for a target-specific variable; it may end up sharing stack slots
+    // and so not need accounting here.
+    if (TargetVarHook(Var))
+      continue;
+    SpilledVariables.push_back(Var);
+  }
+
+  SortedSpilledVariables.reserve(SpilledVariables.size());
+  sortVarsByAlignment(SortedSpilledVariables, SpilledVariables);
+
+  for (Variable *Var : SortedSpilledVariables) {
+    size_t Increment = typeWidthInBytesOnStack(Var->getType());
+    // We have sorted by alignment, so the first variable we encounter that
+    // is located in each area determines the max alignment for the area.
+    if (!*SpillAreaAlignmentBytes)
+      *SpillAreaAlignmentBytes = Increment;
+    if (SimpleCoalescing && VMetadata->isTracked(Var)) {
+      if (VMetadata->isMultiBlock(Var)) {
+        *GlobalsSize += Increment;
+      } else {
+        SizeT NodeIndex = VMetadata->getLocalUseNode(Var)->getIndex();
+        LocalsSize[NodeIndex] += Increment;
+        if (LocalsSize[NodeIndex] > *SpillAreaSizeBytes)
+          *SpillAreaSizeBytes = LocalsSize[NodeIndex];
+        if (!*LocalsSlotsAlignmentBytes)
+          *LocalsSlotsAlignmentBytes = Increment;
+      }
+    } else {
+      *SpillAreaSizeBytes += Increment;
+    }
+  }
+}
+
+void TargetLowering::alignStackSpillAreas(uint32_t SpillAreaStartOffset,
+                                          uint32_t SpillAreaAlignmentBytes,
+                                          size_t GlobalsSize,
+                                          uint32_t LocalsSlotsAlignmentBytes,
+                                          uint32_t *SpillAreaPaddingBytes,
+                                          uint32_t *LocalsSlotsPaddingBytes) {
+  if (SpillAreaAlignmentBytes) {
+    uint32_t PaddingStart = SpillAreaStartOffset;
+    uint32_t SpillAreaStart =
+        Utils::applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
+    *SpillAreaPaddingBytes = SpillAreaStart - PaddingStart;
+  }
+
+  // If there are separate globals and locals areas, make sure the
+  // locals area is aligned by padding the end of the globals area.
+  if (LocalsSlotsAlignmentBytes) {
+    uint32_t GlobalsAndSubsequentPaddingSize = GlobalsSize;
+    GlobalsAndSubsequentPaddingSize =
+        Utils::applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
+    *LocalsSlotsPaddingBytes = GlobalsAndSubsequentPaddingSize - GlobalsSize;
+  }
+}
+
+void TargetLowering::assignVarStackSlots(VarList &SortedSpilledVariables,
+                                         size_t SpillAreaPaddingBytes,
+                                         size_t SpillAreaSizeBytes,
+                                         size_t GlobalsAndSubsequentPaddingSize,
+                                         bool UsesFramePointer) {
+  const VariablesMetadata *VMetadata = Func->getVMetadata();
+  size_t GlobalsSpaceUsed = SpillAreaPaddingBytes;
+  size_t NextStackOffset = SpillAreaPaddingBytes;
+  std::vector<size_t> LocalsSize(Func->getNumNodes());
+  const bool SimpleCoalescing = !callsReturnsTwice();
+  for (Variable *Var : SortedSpilledVariables) {
+    size_t Increment = typeWidthInBytesOnStack(Var->getType());
+    if (SimpleCoalescing && VMetadata->isTracked(Var)) {
+      if (VMetadata->isMultiBlock(Var)) {
+        GlobalsSpaceUsed += Increment;
+        NextStackOffset = GlobalsSpaceUsed;
+      } else {
+        SizeT NodeIndex = VMetadata->getLocalUseNode(Var)->getIndex();
+        LocalsSize[NodeIndex] += Increment;
+        NextStackOffset = SpillAreaPaddingBytes +
+                          GlobalsAndSubsequentPaddingSize +
+                          LocalsSize[NodeIndex];
+      }
+    } else {
+      NextStackOffset += Increment;
+    }
+    if (UsesFramePointer)
+      Var->setStackOffset(-NextStackOffset);
+    else
+      Var->setStackOffset(SpillAreaSizeBytes - NextStackOffset);
+  }
+}
+
 InstCall *TargetLowering::makeHelperCall(const IceString &Name, Variable *Dest,
                                         SizeT MaxSrcs) {
  const bool HasTailCall = false;

--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -165,6 +165,7 @@ public:
  virtual bool hasFramePointer() const { return false; }
  virtual SizeT getFrameOrStackReg() const = 0;
  virtual size_t typeWidthInBytesOnStack(Type Ty) const = 0;
+
  bool hasComputedFrame() const { return HasComputedFrame; }
  // Returns true if this function calls a function that has the
  // "returns twice" attribute.
@@ -259,10 +260,66 @@ protected:
  // to keep liveness analysis consistent.
  void inferTwoAddress();

+  // Make a pass over the Cfg to determine which variables need stack slots
+  // and place them in a sorted list (SortedSpilledVariables). Among those
+  // vars, classify the spill variables as local to the basic block vs
+  // global (multi-block) in order to compute the parameters GlobalsSize
+  // and SpillAreaSizeBytes (represents locals or general vars if the
+  // coalescing of locals is disallowed) along with alignments required
+  // for variables in each area. We rely on accurate VMetadata in order to
+  // classify a variable as global vs local (otherwise the variable is
+  // conservatively global). The size/alignment args must be initialized to 0.
+  //
+  // This is only a pre-pass and the actual stack slot assignment is
+  // handled separately.
+  //
+  // There may be target-specific Variable types, which will be handled
+  // by TargetVarHook. If the TargetVarHook returns true, then the variable
+  // is skipped and not considered with the rest of the spilled variables.
+  void getVarStackSlotParams(VarList &SortedSpilledVariables,
+                             llvm::SmallBitVector &RegsUsed,
+                             size_t *GlobalsSize, size_t *SpillAreaSizeBytes,
+                             uint32_t *SpillAreaAlignmentBytes,
+                             uint32_t *LocalsSlotsAlignmentBytes,
+                             std::function<bool(Variable *)> TargetVarHook);
+
+  // Calculate the amount of padding needed to align the local and global
+  // areas to the required alignment.  This assumes the globals/locals layout
+  // used by getVarStackSlotParams and assignVarStackSlots.
+  void alignStackSpillAreas(uint32_t SpillAreaStartOffset,
+                            uint32_t SpillAreaAlignmentBytes,
+                            size_t GlobalsSize,
+                            uint32_t LocalsSlotsAlignmentBytes,
+                            uint32_t *SpillAreaPaddingBytes,
+                            uint32_t *LocalsSlotsPaddingBytes);
+
+  // Make a pass through the SortedSpilledVariables and actually assign
+  // stack slots. SpillAreaPaddingBytes takes into account stack alignment
+  // padding. The SpillArea starts after that amount of padding.
+  // This matches the scheme in getVarStackSlotParams, where there may
+  // be a separate multi-block global var spill area and a local var
+  // spill area.
+  void assignVarStackSlots(VarList &SortedSpilledVariables,
+                           size_t SpillAreaPaddingBytes,
+                           size_t SpillAreaSizeBytes,
+                           size_t GlobalsAndSubsequentPaddingSize,
+                           bool UsesFramePointer);
+
+  // Sort the variables in Source based on required alignment.
+  // The variables with the largest alignment requirement are placed at the
+  // front of the Dest list.
+  void sortVarsByAlignment(VarList &Dest, const VarList &Source) const;
+
  // Make a call to an external helper function.
  InstCall *makeHelperCall(const IceString &Name, Variable *Dest,
                           SizeT MaxSrcs);

+  void
+  _bundle_lock(InstBundleLock::Option BundleOption = InstBundleLock::Opt_None) {
+    Context.insert(InstBundleLock::create(Func, BundleOption));
+  }
+  void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); }
+
  Cfg *Func;
  GlobalContext *Ctx;
  bool HasComputedFrame;

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -52,6 +52,7 @@ public:
    // i8, and i16 are rounded up to 4 bytes.
    return (typeWidthInBytes(Ty) + 3) & ~3;
  }
+
  void emitVariable(const Variable *Var) const override;

  const char *getConstantPrefix() const final { return "#"; }
@@ -71,6 +72,8 @@ public:
  void split64(Variable *Var);
  Operand *loOperand(Operand *Operand);
  Operand *hiOperand(Operand *Operand);
+  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);

 protected:
  explicit TargetARM32(Cfg *Func);
@@ -219,6 +222,15 @@ protected:
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Orr::create(Func, Dest, Src0, Src1, Pred));
  }
+  void _push(const VarList &Sources) {
+    Context.insert(InstARM32Push::create(Func, Sources));
+  }
+  void _pop(const VarList &Dests) {
+    Context.insert(InstARM32Pop::create(Func, Dests));
+    // Mark dests as modified.
+    for (Variable *Dest : Dests)
+      Context.insert(InstFakeDef::create(Func, Dest));
+  }
  void _sbc(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred));
@@ -253,6 +265,8 @@ protected:

  bool UsesFramePointer;
  bool NeedsStackAlignment;
+  bool MaybeLeafFunc;
+  size_t SpillAreaSizeBytes;
  llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
  llvm::SmallBitVector ScratchRegs;
  llvm::SmallBitVector RegsUsed;

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -120,6 +120,7 @@ public:
    // i8, and i16 are rounded up to 4 bytes.
    return (typeWidthInBytes(Ty) + 3) & ~3;
  }
+
  void emitVariable(const Variable *Var) const override;

  const char *getConstantPrefix() const final { return "$"; }
@@ -139,10 +140,10 @@ public:
  // function calls using the 32-bit push instruction (though the
  // latter could be done by directly writing to the stack).
  void split64(Variable *Var);
-  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
-                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
  Operand *loOperand(Operand *Operand);
  Operand *hiOperand(Operand *Operand);
+  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
  X8632::Address stackVarToAsmOperand(const Variable *Var) const;

  enum X86InstructionSet {
@@ -205,8 +206,6 @@ protected:
  void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
                           Operand *Src0, Operand *Src1);

-  void sortByAlignment(VarList &Dest, const VarList &Source) const;
-
  // Operand legalization helpers.  To deal with address mode
  // constraints, the helpers will create a new Operand and emit
  // instructions that guarantee that the Operand kind is one of those
@@ -303,11 +302,6 @@ protected:
  void _bswap(Variable *SrcDest) {
    Context.insert(InstX8632Bswap::create(Func, SrcDest));
  }
-  void
-  _bundle_lock(InstBundleLock::Option BundleOption = InstBundleLock::Opt_None) {
-    Context.insert(InstBundleLock::create(Func, BundleOption));
-  }
-  void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); }
  void _cbwdq(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
  }

--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
@@ -127,6 +127,8 @@ entry:
 }
 ; In -O2, the order of the CHECK-DAG lines in the output is switched.
 ; CHECK-LABEL: variable_n_align_32
+; CHECK:      push    ebp
+; CHECK:      mov     ebp,esp
 ; CHECK-DAG:  and     esp,0xffffffe0
 ; CHECK-DAG:  mov     eax,DWORD PTR [ebp+0x8]
 ; CHECK:      add     eax,0x1f
@@ -135,13 +137,19 @@ entry:
 ; CHECK:      sub     esp,0x10
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f2
+; CHECK:      mov     esp,ebp
+; CHECK:      pop     ebp

 ; ARM32-LABEL: variable_n_align_32
+; ARM32:      push {fp, lr}
+; ARM32:      mov fp, sp
 ; ARM32:      bic sp, sp, #31
 ; ARM32:      add r0, r0, #31
 ; ARM32:      bic r0, r0, #31
 ; ARM32:      sub sp, sp, r0
 ; ARM32:      bl {{.*}} R_{{.*}}    f2
+; ARM32:      mov sp, fp
+; ARM32:      pop {fp, lr}

 ; Test alloca with default (0) alignment.
 define void @align0(i32 %n) {

--- a/tests_lit/llvm2ice_tests/branch-opt.ll
+++ b/tests_lit/llvm2ice_tests/branch-opt.ll
@@ -93,9 +93,9 @@ target:
 ; ARM32O2-NEXT: cmp {{.*}}, #0
 ; ARM32O2-NEXT: bne
 ; ARM32O2-NEXT: bl
-; ARM32O2-NEXT: bx lr
-; ARM32O2-NEXT: bl
-; ARM32O2-NEXT: bx lr
+; ARM32O2: bx lr
+; ARM32O2: bl
+; ARM32O2: bx lr

 ; ARM32OM1-LABEL: testCondFallthroughToNextBlock
 ; ARM32OM1: cmp {{.*}}, #123
@@ -151,9 +151,9 @@ target:
 ; ARM32O2-NEXT: cmp {{.*}}, #0
 ; ARM32O2-NEXT: beq
 ; ARM32O2-NEXT: bl
-; ARM32O2-NEXT: bx lr
-; ARM32O2-NEXT: bl
-; ARM32O2-NEXT: bx lr
+; ARM32O2: bx lr
+; ARM32O2: bl
+; ARM32O2: bx lr

 ; ARM32OM1-LABEL: testCondTargetNextBlock
 ; ARM32OM1: cmp {{.*}}, #123

--- a/tests_lit/llvm2ice_tests/int-arg.ll
+++ b/tests_lit/llvm2ice_tests/int-arg.ll
@@ -73,8 +73,7 @@ entry:
 ; CHECK-NEXT: mov eax,{{.*}} [esp+0x14]
 ; CHECK-NEXT: ret
 ; ARM32-LABEL: test_returning32_arg4
-; TODO(jvoung): Toggle this on, once addProlog is done.
-; TODOARM32-NEXT: ldr r0, [sp]
+; ARM32-NEXT: ldr r0, [sp]
 ; ARM32-NEXT: bx lr


@@ -86,8 +85,7 @@ entry:
 ; CHECK-NEXT: mov eax,{{.*}} [esp+0x18]
 ; CHECK-NEXT: ret
 ; ARM32-LABEL: test_returning32_arg5
-; TODO(jvoung): Toggle this on, once addProlog is done.
-; TODOARM32-NEXT: ldr r0, [sp, #4]
+; ARM32-NEXT: ldr r0, [sp, #4]
 ; ARM32-NEXT: bx lr

 ; i64
@@ -126,9 +124,8 @@ entry:
 ; CHECK: ret
 ; ARM32-LABEL: test_returning64_arg2
 ; This could have been a ldm sp, {r0, r1}, but we don't do the ldm optimization.
-; TODO(jvoung): enable this once addProlog is done.
-; TODOARM32-NEXT: ldr r0, [sp]
-; TODOARM32-NEXT: ldr r1, [sp, #4]
+; ARM32-NEXT: ldr r0, [sp]
+; ARM32-NEXT: ldr r1, [sp, #4]
 ; ARM32-NEXT: bx lr

 define i64 @test_returning64_arg3(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3) {
@@ -140,9 +137,8 @@ entry:
 ; CHECK-NEXT: mov {{.*}} [esp+0x20]
 ; CHECK: ret
 ; ARM32-LABEL: test_returning64_arg3
-; TODO(jvoung): enable this once addProlog is done.
-; TODOARM32-NEXT: ldr r0, [sp, #8]
-; TODOARM32-NEXT: ldr r1, [sp, #12]
+; ARM32-NEXT: ldr r0, [sp, #8]
+; ARM32-NEXT: ldr r1, [sp, #12]
 ; ARM32-NEXT: bx lr


@@ -185,9 +181,8 @@ entry:
 ; CHECK-NEXT: mov {{.*}} [esp+0x14]
 ; CHECK: ret
 ; ARM32-LABEL: test_returning64_even_arg2
-; TODO(jvoung): enable this once addProlog is done.
-; TODOARM32-NEXT: ldr r0, [sp]
-; TODOARM32-NEXT: ldr r1, [sp, #4]
+; ARM32-NEXT: ldr r0, [sp]
+; ARM32-NEXT: ldr r1, [sp, #4]
 ; ARM32-NEXT: bx lr

 define i64 @test_returning64_even_arg2b(i64 %arg0, i32 %arg1, i32 %arg1b, i64 %arg2) {
@@ -199,9 +194,8 @@ entry:
 ; CHECK-NEXT: mov {{.*}} [esp+0x18]
 ; CHECK: ret
 ; ARM32-LABEL: test_returning64_even_arg2b
-; TODO(jvoung): enable this once addProlog is done.
-; TODOARM32-NEXT: ldr r0, [sp]
-; TODOARM32-NEXT: ldr r1, [sp, #4]
+; ARM32-NEXT: ldr r0, [sp]
+; ARM32-NEXT: ldr r1, [sp, #4]
 ; ARM32-NEXT: bx lr

 define i32 @test_returning32_even_arg2(i64 %arg0, i32 %arg1, i32 %arg2) {
@@ -236,8 +230,7 @@ entry:
 ; CHECK-NEXT: mov {{.*}} [esp+0x18]
 ; CHECK-NEXT: ret
 ; ARM32-LABEL: test_returning32_even_arg4
-; TODO(jvoung): enable this once addProlog is done.
-; TODOARM32-NEXT: ldr r0, [sp, #8]
+; ARM32-NEXT: ldr r0, [sp, #8]
 ; ARM32-NEXT: bx lr

 ; Test interleaving float/double and integer (different register streams on ARM).