Commit 70d0a054 by Jim Stichnoth

Subzero: Use the linear-scan register allocator for Om1 as well.

This removes the need for Om1's postLower() code which did its own ad-hoc register allocation. And it actually speeds up Om1 translation significantly. This mode of register allocation only allocates for infinite-weight Variables, while respecting live ranges of pre-colored Variables.

BUG= none
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/733643005
parent edc115ec
......@@ -102,6 +102,11 @@ enum LivenessMode {
Liveness_Intervals
};
enum RegAllocKind {
RAK_Global, // full, global register allocation
RAK_InfOnly // allocation only for infinite-weight Variables
};
enum VerboseItem {
IceV_None = 0,
IceV_Instructions = 1 << 0,
......
......@@ -73,17 +73,16 @@ void dumpLiveRange(const Variable *Var, const Cfg *Func) {
} // end of anonymous namespace
void LinearScan::initForGlobalAlloc() {
// Prepare for full register allocation of all variables. We depend
// on liveness analysis to have calculated live ranges.
void LinearScan::initForGlobal() {
TimerMarker T(TimerStack::TT_initUnhandled, Func);
Unhandled.clear();
UnhandledPrecolored.clear();
Handled.clear();
Inactive.clear();
Active.clear();
// Gather the live ranges of all variables and add them to the
// Unhandled set.
FindPreference = true;
FindOverlap = true;
const VarList &Vars = Func->getVariables();
Unhandled.reserve(Vars.size());
// Gather the live ranges of all variables and add them to the
// Unhandled set.
for (Variable *Var : Vars) {
// Explicitly don't consider zero-weight variables, which are
// meant to be spill slots.
......@@ -101,6 +100,128 @@ void LinearScan::initForGlobalAlloc() {
UnhandledPrecolored.push_back(Var);
}
}
// Build the (ordered) list of FakeKill instruction numbers.
Kills.clear();
for (CfgNode *Node : Func->getNodes()) {
for (auto I = Node->getInsts().begin(), E = Node->getInsts().end(); I != E;
++I) {
if (auto Kill = llvm::dyn_cast<InstFakeKill>(I)) {
if (!Kill->isDeleted() && !Kill->getLinked()->isDeleted())
Kills.push_back(I->getNumber());
}
}
}
}
// Prepare for very simple register allocation of only infinite-weight
// Variables while respecting pre-colored Variables. Some properties
// we take advantage of:
//
// * Live ranges of interest consist of a single segment.
//
// * Live ranges of interest never span a call instruction.
//
// * Phi instructions are not considered because either phis have
// already been lowered, or they don't contain any pre-colored or
// infinite-weight Variables.
//
// * We don't need to renumber instructions before computing live
// ranges because all the high-level ICE instructions are deleted
// prior to lowering, and the low-level instructions are added in
// monotonically increasing order.
//
// * There are no opportunities for register preference or allowing
// overlap.
//
// Some properties we aren't (yet) taking advantage of:
//
// * Because live ranges are a single segment, the Unhandled set will
// always be empty, and the live range trimming operation is
// unnecessary.
//
// * Calculating overlap of single-segment live ranges could be
// optimized a bit.
void LinearScan::initForInfOnly() {
  TimerMarker T(TimerStack::TT_initUnhandled, Func);
  // Register preference and overlap are not applicable in this mode
  // (see the properties documented above this function).
  FindPreference = false;
  FindOverlap = false;
  // Count of variables whose live-range begin has been recorded; used
  // to reserve Unhandled exactly and cross-checked by the assert below.
  SizeT NumVars = 0;
  const VarList &Vars = Func->getVariables();

  // Iterate across all instructions and record the begin and end of
  // the live range for each variable that is pre-colored or infinite
  // weight.  Instruction numbers are monotonically increasing, so the
  // first defining instruction gives the range start and the last use
  // gives the range end.
  std::vector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel);
  std::vector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel);
  for (CfgNode *Node : Func->getNodes()) {
    for (auto Inst = Node->getInsts().begin(), E = Node->getInsts().end();
         Inst != E; ++Inst) {
      if (Inst->isDeleted())
        continue;
      // A variable of interest starts its live range at its first
      // (non-deleted) definition.
      if (const Variable *Var = Inst->getDest()) {
        if (Var->hasReg() || Var->getWeight() == RegWeight::Inf) {
          if (LRBegin[Var->getIndex()] == Inst::NumberSentinel) {
            LRBegin[Var->getIndex()] = Inst->getNumber();
            ++NumVars;
          }
        }
      }
      // A variable of interest extends its live range through its last
      // use as a source operand.
      for (SizeT I = 0; I < Inst->getSrcSize(); ++I) {
        Operand *Src = Inst->getSrc(I);
        // Renamed from the original's inner "NumVars", which shadowed
        // the outer NumVars counter that the final assert depends on.
        const SizeT NumSrcVars = Src->getNumVars();
        for (SizeT J = 0; J < NumSrcVars; ++J) {
          const Variable *Var = Src->getVar(J);
          if (Var->hasReg() || Var->getWeight() == RegWeight::Inf)
            LREnd[Var->getIndex()] = Inst->getNumber();
        }
      }
    }
  }

  // Build the Unhandled (and UnhandledPrecolored) worklists from the
  // recorded single-segment live ranges.
  Unhandled.reserve(NumVars);
  for (SizeT i = 0; i < Vars.size(); ++i) {
    Variable *Var = Vars[i];
    if (LRBegin[i] != Inst::NumberSentinel) {
      // A variable with a recorded definition must also have a
      // recorded use; otherwise its range end would be the sentinel.
      assert(LREnd[i] != Inst::NumberSentinel);
      Unhandled.push_back(Var);
      Var->resetLiveRange();
      const uint32_t WeightDelta = 1;
      Var->addLiveRange(LRBegin[i], LREnd[i], WeightDelta);
      Var->untrimLiveRange();
      if (Var->hasReg()) {
        // Pre-colored variable: seed its temporary register assignment
        // and treat it as infinite weight so it is never evicted.
        Var->setRegNumTmp(Var->getRegNum());
        Var->setLiveRangeInfiniteWeight();
        UnhandledPrecolored.push_back(Var);
      }
      --NumVars;
    }
  }
  // This isn't actually a fatal condition, but it would be nice to
  // know if we somehow pre-calculated Unhandled's size wrong.
  assert(NumVars == 0);

  // Don't build up the list of Kills because we know that no
  // infinite-weight Variable has a live range spanning a call.
  Kills.clear();
}
void LinearScan::init(RegAllocKind Kind) {
Unhandled.clear();
UnhandledPrecolored.clear();
Handled.clear();
Inactive.clear();
Active.clear();
switch (Kind) {
case RAK_Global:
initForGlobal();
break;
case RAK_InfOnly:
initForInfOnly();
break;
}
struct CompareRanges {
bool operator()(const Variable *L, const Variable *R) {
InstNumberT Lstart = L->getLiveRange().getStart();
......@@ -114,20 +235,6 @@ void LinearScan::initForGlobalAlloc() {
std::sort(Unhandled.rbegin(), Unhandled.rend(), CompareRanges());
std::sort(UnhandledPrecolored.rbegin(), UnhandledPrecolored.rend(),
CompareRanges());
// Build the (ordered) list of FakeKill instruction numbers.
Kills.clear();
for (CfgNode *Node : Func->getNodes()) {
for (auto I = Node->getInsts().begin(), E = Node->getInsts().end(); I != E;
++I) {
if (I->isDeleted())
continue;
if (auto Kill = llvm::dyn_cast<InstFakeKill>(I)) {
if (!Kill->getLinked()->isDeleted())
Kills.push_back(I->getNumber());
}
}
}
}
// Implements the linear-scan algorithm. Based on "Linear Scan
......@@ -292,41 +399,41 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
Variable *Prefer = NULL;
int32_t PreferReg = Variable::NoRegister;
bool AllowOverlap = false;
if (const Inst *DefInst = VMetadata->getFirstDefinition(Cur)) {
assert(DefInst->getDest() == Cur);
bool IsAssign = DefInst->isSimpleAssign();
bool IsSingleDef = !VMetadata->isMultiDef(Cur);
for (SizeT i = 0; i < DefInst->getSrcSize(); ++i) {
// TODO(stichnot): Iterate through the actual Variables of the
// instruction, not just the source operands. This could
// capture Load instructions, including address mode
// optimization, for Prefer (but not for AllowOverlap).
if (Variable *SrcVar = llvm::dyn_cast<Variable>(DefInst->getSrc(i))) {
int32_t SrcReg = SrcVar->getRegNumTmp();
// Only consider source variables that have (so far) been
// assigned a register. That register must be one in the
// RegMask set, e.g. don't try to prefer the stack pointer
// as a result of the stacksave intrinsic.
if (SrcVar->hasRegTmp() && RegMask[SrcReg]) {
if (!Free[SrcReg]) {
// Don't bother trying to enable AllowOverlap if the
// register is already free.
AllowOverlap =
IsSingleDef && IsAssign && !overlapsDefs(Func, Cur, SrcVar);
}
if (AllowOverlap || Free[SrcReg]) {
Prefer = SrcVar;
PreferReg = SrcReg;
if (FindPreference) {
if (const Inst *DefInst = VMetadata->getFirstDefinition(Cur)) {
assert(DefInst->getDest() == Cur);
bool IsAssign = DefInst->isSimpleAssign();
bool IsSingleDef = !VMetadata->isMultiDef(Cur);
for (SizeT i = 0; i < DefInst->getSrcSize(); ++i) {
// TODO(stichnot): Iterate through the actual Variables of the
// instruction, not just the source operands. This could
// capture Load instructions, including address mode
// optimization, for Prefer (but not for AllowOverlap).
if (Variable *SrcVar = llvm::dyn_cast<Variable>(DefInst->getSrc(i))) {
int32_t SrcReg = SrcVar->getRegNumTmp();
// Only consider source variables that have (so far) been
// assigned a register. That register must be one in the
// RegMask set, e.g. don't try to prefer the stack pointer
// as a result of the stacksave intrinsic.
if (SrcVar->hasRegTmp() && RegMask[SrcReg]) {
if (FindOverlap && !Free[SrcReg]) {
// Don't bother trying to enable AllowOverlap if the
// register is already free.
AllowOverlap =
IsSingleDef && IsAssign && !overlapsDefs(Func, Cur, SrcVar);
}
if (AllowOverlap || Free[SrcReg]) {
Prefer = SrcVar;
PreferReg = SrcReg;
}
}
}
}
}
}
if (Verbose) {
if (Prefer) {
Str << "Initial Prefer=" << *Prefer << " R=" << PreferReg
<< " LIVE=" << Prefer->getLiveRange() << " Overlap=" << AllowOverlap
<< "\n";
if (Verbose && Prefer) {
Str << "Initial Prefer=" << *Prefer << " R=" << PreferReg
<< " LIVE=" << Prefer->getLiveRange()
<< " Overlap=" << AllowOverlap << "\n";
}
}
}
......@@ -353,12 +460,14 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
// Disable AllowOverlap if an Active variable, which is not
// Prefer, shares Prefer's register, and has a definition within
// Cur's live range.
for (const Variable *Item : Active) {
int32_t RegNum = Item->getRegNumTmp();
if (Item != Prefer && RegNum == PreferReg &&
overlapsDefs(Func, Cur, Item)) {
AllowOverlap = false;
dumpDisableOverlap(Func, Item, "Active");
if (AllowOverlap) {
for (const Variable *Item : Active) {
int32_t RegNum = Item->getRegNumTmp();
if (Item != Prefer && RegNum == PreferReg &&
overlapsDefs(Func, Cur, Item)) {
AllowOverlap = false;
dumpDisableOverlap(Func, Item, "Active");
}
}
}
......
......@@ -26,12 +26,16 @@ class LinearScan {
LinearScan &operator=(const LinearScan &) = delete;
public:
LinearScan(Cfg *Func) : Func(Func) {}
void initForGlobalAlloc();
LinearScan(Cfg *Func)
: Func(Func), FindPreference(false), FindOverlap(false) {}
void init(RegAllocKind Kind);
void scan(const llvm::SmallBitVector &RegMask);
void dump(Cfg *Func) const;
private:
void initForGlobal();
void initForInfOnly();
Cfg *const Func;
typedef std::vector<Variable *> OrderedRanges;
typedef std::list<Variable *> UnorderedRanges;
......@@ -41,6 +45,12 @@ private:
OrderedRanges UnhandledPrecolored;
UnorderedRanges Active, Inactive, Handled;
std::vector<InstNumberT> Kills;
bool FindPreference;
bool FindOverlap;
// TODO(stichnot): We're not really using FindOverlap yet, but we
// may want a flavor of register allocation where FindPreference is
// useful but we didn't want to initialize VMetadata with VMK_All
// and therefore we can't safely allow overlap.
};
} // end of namespace Ice
......
......@@ -225,7 +225,7 @@ void TargetLowering::lower() {
// perhaps for the frame pointer) to be allocated. This set of
// registers could potentially be parameterized if we want to restrict
// registers e.g. for performance testing.
void TargetLowering::regAlloc() {
void TargetLowering::regAlloc(RegAllocKind Kind) {
TimerMarker T(TimerStack::TT_regAlloc, Func);
LinearScan LinearScan(Func);
RegSetMask RegInclude = RegSet_None;
......@@ -234,7 +234,7 @@ void TargetLowering::regAlloc() {
RegInclude |= RegSet_CalleeSave;
if (hasFramePointer())
RegExclude |= RegSet_FramePointer;
LinearScan.initForGlobalAlloc();
LinearScan.init(Kind);
llvm::SmallBitVector RegMask = getRegisterSet(RegInclude, RegExclude);
LinearScan.scan(RegMask);
}
......
......@@ -195,7 +195,7 @@ public:
virtual llvm::SmallBitVector getRegisterSet(RegSetMask Include,
RegSetMask Exclude) const = 0;
virtual const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const = 0;
void regAlloc();
void regAlloc(RegAllocKind Kind);
virtual void emitVariable(const Variable *Var) const = 0;
......@@ -236,11 +236,7 @@ protected:
virtual void doAddressOptStore() {}
virtual void randomlyInsertNop(float Probability) = 0;
// This gives the target an opportunity to post-process the lowered
// expansion before returning. The primary intention is to do some
// Register Manager activity as necessary, specifically to eagerly
// allocate registers based on affinity and other factors. The
// simplest lowering does nothing here and leaves it all to a
// subsequent global register allocation pass.
// expansion before returning.
virtual void postLower() {}
Cfg *Func;
......
......@@ -9,9 +9,7 @@
//
// This file implements the TargetLoweringX8632 class, which
// consists almost entirely of the lowering sequence for each
// high-level instruction. It also implements
// TargetX8632Fast::postLower() which does the simplest possible
// register allocation for the "fast" target.
// high-level instruction.
//
//===----------------------------------------------------------------------===//
......@@ -375,7 +373,7 @@ void TargetX8632::translateO2() {
// associated cleanup, to make the dump cleaner and more useful.
Func->dump("After initial x8632 codegen");
Func->getVMetadata()->init(VMK_All);
regAlloc();
regAlloc(RAK_Global);
if (Func->hasError())
return;
Func->dump("After linear scan regalloc");
......@@ -429,6 +427,11 @@ void TargetX8632::translateOm1() {
return;
Func->dump("After initial x8632 codegen");
regAlloc(RAK_InfOnly);
if (Func->hasError())
return;
Func->dump("After regalloc of infinite-weight variables");
Func->genFrame();
if (Func->hasError())
return;
......@@ -1816,9 +1819,6 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
// stack locations.
for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
// TODO: Consider calling postLower() here to reduce the register
// pressure associated with using too many infinite weight
// temporaries when lowering the call sequence in -Om1 mode.
}
// Copy arguments to be passed in registers to the appropriate
......@@ -4112,8 +4112,6 @@ void TargetX8632::scalarizeArithmetic(InstArithmetic::OpKind Kind,
Variable *DestT = Func->makeVariable(Ty);
lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
T = DestT;
// TODO(stichnot): Use postLower() in -Om1 mode to avoid buildup of
// infinite weight temporaries.
}
lowerAssign(InstAssign::create(Func, Dest, T));
......@@ -4200,7 +4198,7 @@ void TargetX8632::lowerPhiAssignments(CfgNode *Node,
assert(Node->getPhis().empty());
CfgNode *Succ = Node->getOutEdges().front();
getContext().init(Node);
// Register set setup similar to regAlloc() and postLower().
// Register set setup similar to regAlloc().
RegSetMask RegInclude = RegSet_All;
RegSetMask RegExclude = RegSet_StackPointer;
if (hasFramePointer())
......@@ -4512,115 +4510,20 @@ Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
}
void TargetX8632::postLower() {
if (Ctx->getOptLevel() != Opt_m1) {
// Find two-address non-SSA instructions where Dest==Src0, and set
// the DestNonKillable flag to keep liveness analysis consistent.
for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) {
if (Inst->isDeleted())
continue;
if (Variable *Dest = Inst->getDest()) {
// TODO(stichnot): We may need to consider all source
// operands, not just the first one, if using 3-address
// instructions.
if (Inst->getSrcSize() > 0 && Inst->getSrc(0) == Dest)
Inst->setDestNonKillable();
}
}
if (Ctx->getOptLevel() == Opt_m1)
return;
}
// TODO: Avoid recomputing WhiteList every instruction.
RegSetMask RegInclude = RegSet_All;
RegSetMask RegExclude = RegSet_StackPointer;
if (hasFramePointer())
RegExclude |= RegSet_FramePointer;
llvm::SmallBitVector WhiteList = getRegisterSet(RegInclude, RegExclude);
// Make one pass to black-list pre-colored registers. TODO: If
// there was some prior register allocation pass that made register
// assignments, those registers need to be black-listed here as
// well.
llvm::DenseMap<const Variable *, const Inst *> LastUses;
// The first pass also keeps track of which instruction is the last
// use for each infinite-weight variable. After the last use, the
// variable is released to the free list.
for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) {
if (Inst->isDeleted())
continue;
// Don't consider a FakeKill instruction, because (currently) it
// is only used to kill all scratch registers at a call site, and
// we don't want to black-list all scratch registers during the
// call lowering. This could become a problem since it relies on
// the lowering sequence not keeping any infinite-weight variables
// live across a call. TODO(stichnot): Consider replacing this
// whole postLower() implementation with a robust local register
// allocator, for example compute live ranges only for pre-colored
// and infinite-weight variables and run the existing linear-scan
// allocator.
assert(!llvm::isa<InstFakeKill>(Inst) || Inst->getSrcSize() == 0);
for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
Operand *Src = Inst->getSrc(SrcNum);
SizeT NumVars = Src->getNumVars();
for (SizeT J = 0; J < NumVars; ++J) {
const Variable *Var = Src->getVar(J);
// Track last uses of all variables, regardless of whether
// they are pre-colored or infinite-weight.
LastUses[Var] = Inst;
if (!Var->hasReg())
continue;
WhiteList[Var->getRegNum()] = false;
}
}
}
// The second pass colors infinite-weight variables.
llvm::SmallBitVector AvailableRegisters = WhiteList;
llvm::SmallBitVector FreedRegisters(WhiteList.size());
// Find two-address non-SSA instructions where Dest==Src0, and set
// the DestNonKillable flag to keep liveness analysis consistent.
for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) {
FreedRegisters.reset();
if (Inst->isDeleted())
continue;
// Iterate over all variables referenced in the instruction,
// including the Dest variable (if any). If the variable is
// marked as infinite-weight, find it a register. If this
// instruction is the last use of the variable in the lowered
// sequence, release the register to the free list after this
// instruction is completely processed. Note that the first pass
// ignores the Dest operand, under the assumption that a
// pre-colored Dest will appear as a source operand in some
// subsequent instruction in the lowered sequence.
Variable *Dest = Inst->getDest();
SizeT NumSrcs = Inst->getSrcSize();
if (Dest)
++NumSrcs;
if (NumSrcs == 0)
continue;
OperandList Srcs(NumSrcs);
for (SizeT i = 0; i < Inst->getSrcSize(); ++i)
Srcs[i] = Inst->getSrc(i);
if (Dest)
Srcs[NumSrcs - 1] = Dest;
for (SizeT SrcNum = 0; SrcNum < NumSrcs; ++SrcNum) {
Operand *Src = Srcs[SrcNum];
SizeT NumVars = Src->getNumVars();
for (SizeT J = 0; J < NumVars; ++J) {
Variable *Var = Src->getVar(J);
if (!Var->hasReg() && Var->getWeight().isInf()) {
llvm::SmallBitVector AvailableTypedRegisters =
AvailableRegisters & getRegisterSetForType(Var->getType());
assert(AvailableTypedRegisters.any());
int32_t RegNum = AvailableTypedRegisters.find_first();
Var->setRegNum(RegNum);
AvailableRegisters[RegNum] = false;
}
if (Var->hasReg()) {
int32_t RegNum = Var->getRegNum();
assert(!AvailableRegisters[RegNum]);
if (LastUses[Var] == Inst) {
if (WhiteList[RegNum])
FreedRegisters[RegNum] = true;
}
}
}
if (Variable *Dest = Inst->getDest()) {
// TODO(stichnot): We may need to consider all source
// operands, not just the first one, if using 3-address
// instructions.
if (Inst->getSrcSize() > 0 && Inst->getSrc(0) == Dest)
Inst->setDestNonKillable();
}
AvailableRegisters |= FreedRegisters;
}
}
......
......@@ -83,10 +83,12 @@ void Translator::translateFcn(Cfg *Fcn) {
ErrorStatus = true;
}
if (Ctx->getFlags().UseIntegratedAssembler) {
Func->emitIAS();
} else {
Func->emit();
if (!ErrorStatus) {
if (Ctx->getFlags().UseIntegratedAssembler) {
Func->emitIAS();
} else {
Func->emit();
}
}
Ctx->dumpStats(Func->getFunctionName());
}
......
......@@ -38,10 +38,10 @@ entry:
; CHECK: movzx eax, byte ptr [ebp - 4]
; CHECK: mov dword ptr [ebp - 16], eax
; CHECK: sub esp, 16
; CHECK: mov ecx, dword ptr [ebp + 8]
; CHECK: mov dword ptr [esp], ecx
; CHECK: mov ecx, dword ptr [ebp - 12]
; CHECK: mov dword ptr [esp + 4], ecx
; CHECK: mov ecx, dword ptr [ebp - 16]
; CHECK: mov dword ptr [esp + 8], ecx
; CHECK: mov eax, dword ptr [ebp + 8]
; CHECK: mov dword ptr [esp], eax
; CHECK: mov eax, dword ptr [ebp - 12]
; CHECK: mov dword ptr [esp + 4], eax
; CHECK: mov eax, dword ptr [ebp - 16]
; CHECK: mov dword ptr [esp + 8], eax
; CHECK: call -4
......@@ -31,9 +31,9 @@ entry:
; PROB50: pmuludq %xmm2, %xmm1
; PROB50: nop # variant = 0
; PROB50: shufps $136, %xmm1, %xmm0
; PROB50: pshufd $216, %xmm0, %xmm1
; PROB50: pshufd $216, %xmm0, %xmm0
; PROB50: nop # variant = 2
; PROB50: movups %xmm1, (%esp)
; PROB50: movups %xmm0, (%esp)
; PROB50: movups (%esp), %xmm0
; PROB50: addl $60, %esp
; PROB50: nop # variant = 0
......@@ -59,9 +59,9 @@ entry:
; PROB90: nop # variant = 3
; PROB90: shufps $136, %xmm1, %xmm0
; PROB90: nop # variant = 4
; PROB90: pshufd $216, %xmm0, %xmm1
; PROB90: pshufd $216, %xmm0, %xmm0
; PROB90: nop # variant = 2
; PROB90: movups %xmm1, (%esp)
; PROB90: movups %xmm0, (%esp)
; PROB90: nop # variant = 4
; PROB90: movups (%esp), %xmm0
; PROB90: nop # variant = 2
......@@ -86,9 +86,9 @@ entry:
; MAXNOPS2: nop # variant = 3
; MAXNOPS2: pmuludq %xmm2, %xmm1
; MAXNOPS2: shufps $136, %xmm1, %xmm0
; MAXNOPS2: pshufd $216, %xmm0, %xmm1
; MAXNOPS2: pshufd $216, %xmm0, %xmm0
; MAXNOPS2: nop # variant = 3
; MAXNOPS2: movups %xmm1, (%esp)
; MAXNOPS2: movups %xmm0, (%esp)
; MAXNOPS2: nop # variant = 0
; MAXNOPS2: movups (%esp), %xmm0
; MAXNOPS2: nop # variant = 2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment