Subzero: Initial O2 lowering

Includes the following: 1. Liveness analysis. 2. Linear-scan register allocation. 3. Address mode optimization. 4. Compare-branch fusing. All of these depend on liveness analysis. There are three versions of liveness analysis (in order of increasing cost): 1. Lightweight. This computes last-uses for variables local to a single basic block. 2. Full. This computes last-uses for all variables based on global dataflow analysis. 3. Full live ranges. This computes all last-uses, plus calculates the live range intervals in terms of instruction numbers. (The live ranges are needed for register allocation.) For testing the full live range computation, Cfg::validateLiveness() checks every Variable of every Inst and verifies that the current Inst is contained within the Variable's live range. The cross tests are run with O2 in addition to Om1. Some of the lit tests (for what good they do) are updated with O2 code sequences. BUG= none R=jvoung@chromium.org Review URL: https://codereview.chromium.org/300563003

Subzero: Initial O2 lowering
d97c7df5 · Jim Stichnoth · 88a485ed · d97c7df5 · d97c7df5 · d97c7df5
Commit d97c7df5 authored Jun 04, 2014 by Jim Stichnoth
34 changed files
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -39,7 +39,9 @@ SRCS= \
 	IceGlobalContext.cpp \
 	IceInst.cpp \
 	IceInstX8632.cpp \
+	IceLiveness.cpp \
 	IceOperand.cpp \
+	IceRegAlloc.cpp \
 	IceTargetLowering.cpp \
 	IceTargetLoweringX8632.cpp \
 	IceTypes.cpp \

--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -5,7 +5,7 @@
 set -eux
-OPTLEVELS="m1"
+OPTLEVELS="m1 2"
 OUTDIR=Output
 # Clean the output directory to avoid reusing stale results.
 rm -rf "${OUTDIR}"

--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -16,6 +16,7 @@
 #include "IceCfgNode.h"
 #include "IceDefs.h"
 #include "IceInst.h"
+#include "IceLiveness.h"
 #include "IceOperand.h"
 #include "IceTargetLowering.h"
@@ -24,7 +25,7 @@ namespace Ice {
 Cfg::Cfg(GlobalContext *Ctx)
    : Ctx(Ctx), FunctionName(""), ReturnType(IceType_void),
      IsInternalLinkage(false), HasError(false), ErrorMessage(""), Entry(NULL),
-      NextInstNumber(1),
+      NextInstNumber(1), Live(NULL),
      Target(TargetLowering::createLowering(Ctx->getTargetArch(), this)),
      CurrentNode(NULL) {}
@@ -62,14 +63,10 @@ void Cfg::addArg(Variable *Arg) {
 bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }
 void Cfg::translate() {
-  Ostream &Str = Ctx->getStrDump();
  if (hasError())
    return;
-  if (Ctx->isVerbose()) {
+  dump("Initial CFG");
-    Str << "================ Initial CFG ================\n";
-    dump();
-  }
  Timer T_translate;
  // The set of translation passes and their order are determined by
@@ -77,10 +74,7 @@ void Cfg::translate() {
  getTarget()->translate();
  T_translate.printElapsedUs(getContext(), "translate()");
-  if (Ctx->isVerbose()) {
+  dump("Final output");
-    Str << "================ Final output ================\n";
-    dump();
-  }
 }
 void Cfg::computePredecessors() {
@@ -89,6 +83,13 @@ void Cfg::computePredecessors() {
  }
 }
+void Cfg::renumberInstructions() {
+  NextInstNumber = 1;
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    (*I)->renumberInstructions();
+  }
+}
 // placePhiLoads() must be called before placePhiStores().
 void Cfg::placePhiLoads() {
  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
@@ -109,6 +110,12 @@ void Cfg::deletePhis() {
  }
 }
+void Cfg::doAddressOpt() {
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    (*I)->doAddressOpt();
+  }
+}
 void Cfg::genCode() {
  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
    (*I)->genCode();
@@ -128,6 +135,150 @@ void Cfg::genFrame() {
  }
 }
+// This is a lightweight version of live-range-end calculation.  Marks
+// the last use of only those variables whose definition and uses are
+// completely with a single block.  It is a quick single pass and
+// doesn't need to iterate until convergence.
+void Cfg::livenessLightweight() {
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    (*I)->livenessLightweight();
+  }
+}
+void Cfg::liveness(LivenessMode Mode) {
+  Live.reset(new Liveness(this, Mode));
+  Live->init();
+  // Initialize with all nodes needing to be processed.
+  llvm::BitVector NeedToProcess(Nodes.size(), true);
+  while (NeedToProcess.any()) {
+    // Iterate in reverse topological order to speed up convergence.
+    for (NodeList::reverse_iterator I = Nodes.rbegin(), E = Nodes.rend();
+         I != E; ++I) {
+      CfgNode *Node = *I;
+      if (NeedToProcess[Node->getIndex()]) {
+        NeedToProcess[Node->getIndex()] = false;
+        bool Changed = Node->liveness(getLiveness());
+        if (Changed) {
+          // If the beginning-of-block liveness changed since the last
+          // iteration, mark all in-edges as needing to be processed.
+          const NodeList &InEdges = Node->getInEdges();
+          for (NodeList::const_iterator I1 = InEdges.begin(),
+                                        E1 = InEdges.end();
+               I1 != E1; ++I1) {
+            CfgNode *Pred = *I1;
+            NeedToProcess[Pred->getIndex()] = true;
+          }
+        }
+      }
+    }
+  }
+  if (Mode == Liveness_Intervals) {
+    // Reset each variable's live range.
+    for (VarList::const_iterator I = Variables.begin(), E = Variables.end();
+         I != E; ++I) {
+      if (Variable *Var = *I)
+        Var->resetLiveRange();
+    }
+  }
+  // Collect timing for just the portion that constructs the live
+  // range intervals based on the end-of-live-range computation, for a
+  // finer breakdown of the cost.
+  Timer T_liveRange;
+  // Make a final pass over instructions to delete dead instructions
+  // and build each Variable's live range.
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    (*I)->livenessPostprocess(Mode, getLiveness());
+  }
+  if (Mode == Liveness_Intervals) {
+    // Special treatment for live in-args.  Their liveness needs to
+    // extend beyond the beginning of the function, otherwise an arg
+    // whose only use is in the first instruction will end up having
+    // the trivial live range [1,1) and will *not* interfere with
+    // other arguments.  So if the first instruction of the method is
+    // "r=arg1+arg2", both args may be assigned the same register.
+    for (SizeT I = 0; I < Args.size(); ++I) {
+      Variable *Arg = Args[I];
+      if (!Live->getLiveRange(Arg).isEmpty()) {
+        // Add live range [-1,0) with weight 0.  TODO: Here and below,
+        // use better symbolic constants along the lines of
+        // Inst::NumberDeleted and Inst::NumberSentinel instead of -1
+        // and 0.
+        Live->addLiveRange(Arg, -1, 0, 0);
+      }
+      // Do the same for i64 args that may have been lowered into i32
+      // Lo and Hi components.
+      Variable *Lo = Arg->getLo();
+      if (Lo && !Live->getLiveRange(Lo).isEmpty())
+        Live->addLiveRange(Lo, -1, 0, 0);
+      Variable *Hi = Arg->getHi();
+      if (Hi && !Live->getLiveRange(Hi).isEmpty())
+        Live->addLiveRange(Hi, -1, 0, 0);
+    }
+    // Copy Liveness::LiveRanges into individual variables.  TODO:
+    // Remove Variable::LiveRange and redirect to
+    // Liveness::LiveRanges.  TODO: make sure Variable weights
+    // are applied properly.
+    SizeT NumVars = Variables.size();
+    for (SizeT i = 0; i < NumVars; ++i) {
+      Variable *Var = Variables[i];
+      Var->setLiveRange(Live->getLiveRange(Var));
+      if (Var->getWeight().isInf())
+        Var->setLiveRangeInfiniteWeight();
+    }
+    T_liveRange.printElapsedUs(getContext(), "live range construction");
+    dump();
+  }
+}
+// Traverse every Variable of every Inst and verify that it
+// appears within the Variable's computed live range.
+bool Cfg::validateLiveness() const {
+  bool Valid = true;
+  Ostream &Str = Ctx->getStrDump();
+  for (NodeList::const_iterator I1 = Nodes.begin(), E1 = Nodes.end(); I1 != E1;
+       ++I1) {
+    CfgNode *Node = *I1;
+    InstList &Insts = Node->getInsts();
+    for (InstList::const_iterator I2 = Insts.begin(), E2 = Insts.end();
+         I2 != E2; ++I2) {
+      Inst *Inst = *I2;
+      if (Inst->isDeleted())
+        continue;
+      if (llvm::isa<InstFakeKill>(Inst))
+        continue;
+      InstNumberT InstNumber = Inst->getNumber();
+      Variable *Dest = Inst->getDest();
+      if (Dest) {
+        // TODO: This instruction should actually begin Dest's live
+        // range, so we could probably test that this instruction is
+        // the beginning of some segment of Dest's live range.  But
+        // this wouldn't work with non-SSA temporaries during
+        // lowering.
+        if (!Dest->getLiveRange().containsValue(InstNumber)) {
+          Valid = false;
+          Str << "Liveness error: inst " << Inst->getNumber() << " dest ";
+          Dest->dump(this);
+          Str << " live range " << Dest->getLiveRange() << "\n";
+        }
+      }
+      for (SizeT I = 0; I < Inst->getSrcSize(); ++I) {
+        Operand *Src = Inst->getSrc(I);
+        SizeT NumVars = Src->getNumVars();
+        for (SizeT J = 0; J < NumVars; ++J) {
+          const Variable *Var = Src->getVar(J);
+          if (!Var->getLiveRange().containsValue(InstNumber)) {
+            Valid = false;
+            Str << "Liveness error: inst " << Inst->getNumber() << " var ";
+            Var->dump(this);
+            Str << " live range " << Var->getLiveRange() << "\n";
+          }
+        }
+      }
+    }
+  }
+  return Valid;
+}
 // ======================== Dump routines ======================== //
 void Cfg::emit() {
@@ -158,8 +309,13 @@ void Cfg::emit() {
  T_emit.printElapsedUs(Ctx, "emit()");
 }
-void Cfg::dump() {
+// Dumps the IR with an optional introductory message.
+void Cfg::dump(const IceString &Message) {
+  if (!Ctx->isVerbose())
+    return;
  Ostream &Str = Ctx->getStrDump();
+  if (!Message.empty())
+    Str << "================ " << Message << " ================\n";
  setCurrentNode(getEntryNode());
  // Print function name+args
  if (getContext()->isVerbose(IceV_Instructions)) {
@@ -176,6 +332,18 @@ void Cfg::dump() {
    Str << ") {\n";
  }
  setCurrentNode(NULL);
+  if (getContext()->isVerbose(IceV_Liveness)) {
+    // Print summary info about variables
+    for (VarList::const_iterator I = Variables.begin(), E = Variables.end();
+         I != E; ++I) {
+      Variable *Var = *I;
+      Str << "//"
+          << " multiblock=" << Var->isMultiblockLife() << " "
+          << " weight=" << Var->getWeight() << " ";
+      Var->dump(this);
+      Str << " LIVE=" << Var->getLiveRange() << "\n";
+    }
+  }
  // Print each basic block
  for (NodeList::const_iterator I = Nodes.begin(), E = Nodes.end(); I != E;
       ++I) {

--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -58,7 +58,7 @@ public:
  const NodeList &getNodes() const { return Nodes; }
  // Manage instruction numbering.
-  int32_t newInstNumber() { return NextInstNumber++; }
+  InstNumberT newInstNumber() { return NextInstNumber++; }
  // Manage Variables.
  Variable *makeVariable(Type Ty, const CfgNode *Node,
@@ -72,6 +72,7 @@ public:
  // Miscellaneous accessors.
  TargetLowering *getTarget() const { return Target.get(); }
+  Liveness *getLiveness() const { return Live.get(); }
  bool hasComputedFrame() const;
  // Passes over the CFG.
@@ -80,11 +81,16 @@ public:
  // compute the predecessor edges, in the form of
  // CfgNode::InEdges[].
  void computePredecessors();
+  void renumberInstructions();
  void placePhiLoads();
  void placePhiStores();
  void deletePhis();
+  void doAddressOpt();
  void genCode();
  void genFrame();
+  void livenessLightweight();
+  void liveness(LivenessMode Mode);
+  bool validateLiveness() const;
  // Manage the CurrentNode field, which is used for validating the
  // Variable::DefNode field during dumping/emitting.
@@ -92,7 +98,7 @@ public:
  const CfgNode *getCurrentNode() const { return CurrentNode; }
  void emit();
-  void dump();
+  void dump(const IceString &Message = "");
  // Allocate data of type T using the per-Cfg allocator.
  template <typename T> T *allocate() { return Allocator.Allocate<T>(); }
@@ -136,9 +142,10 @@ private:
  IceString ErrorMessage;
  CfgNode *Entry; // entry basic block
  NodeList Nodes; // linearized node list; Entry should be first
-  int32_t NextInstNumber;
+  InstNumberT NextInstNumber;
  VarList Variables;
  VarList Args; // subset of Variables, in argument order
+  llvm::OwningPtr<Liveness> Live;
  llvm::OwningPtr<TargetLowering> Target;
  // CurrentNode is maintained during dumping/emitting just for

--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -15,6 +15,7 @@
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceInst.h"
+#include "IceLiveness.h"
 #include "IceOperand.h"
 #include "IceTargetLowering.h"
@@ -49,6 +50,22 @@ void CfgNode::appendInst(Inst *Inst) {
  Inst->updateVars(this);
 }
+// Renumbers the non-deleted instructions in the node.  This needs to
+// be done in preparation for live range analysis.  The instruction
+// numbers in a block must be monotonically increasing.  The range of
+// instruction numbers in a block, from lowest to highest, must not
+// overlap with the range of any other block.
+void CfgNode::renumberInstructions() {
+  for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E; ++I) {
+    (*I)->renumber(Func);
+  }
+  InstList::const_iterator I = Insts.begin(), E = Insts.end();
+  while (I != E) {
+    Inst *Inst = *I++;
+    Inst->renumber(Func);
+  }
+}
 // When a node is created, the OutEdges are immediately knows, but the
 // InEdges have to be built up incrementally.  After the CFG has been
 // constructed, the computePredecessors() pass finalizes it by
@@ -107,10 +124,25 @@ void CfgNode::placePhiStores() {
  // Every block must end in a terminator instruction.
  assert(InsertionPoint != Insts.begin());
  --InsertionPoint;
-  // Confirm via assert() that InsertionPoint is a terminator
+  // Confirm that InsertionPoint is a terminator instruction.  Calling
-  // instruction.  Calling getTerminatorEdges() on a non-terminator
+  // getTerminatorEdges() on a non-terminator instruction will cause
-  // instruction will cause an llvm_unreachable().
+  // an llvm_unreachable().
-  assert(((*InsertionPoint)->getTerminatorEdges(), true));
+  (void)(*InsertionPoint)->getTerminatorEdges();
+  // If the current insertion point is at a conditional branch
+  // instruction, and the previous instruction is a compare
+  // instruction, then we move the insertion point before the compare
+  // instruction so as not to interfere with compare/branch fusing.
+  if (InstBr *Branch = llvm::dyn_cast<InstBr>(*InsertionPoint)) {
+    if (!Branch->isUnconditional()) {
+      if (InsertionPoint != Insts.begin()) {
+        --InsertionPoint;
+        if (!llvm::isa<InstIcmp>(*InsertionPoint) &&
+            !llvm::isa<InstFcmp>(*InsertionPoint)) {
+          ++InsertionPoint;
+        }
+      }
+    }
+  }
  // Consider every out-edge.
  for (NodeList::const_iterator I1 = OutEdges.begin(), E1 = OutEdges.end();
@@ -145,6 +177,19 @@ void CfgNode::deletePhis() {
  }
 }
+// Does address mode optimization.  Pass each instruction to the
+// TargetLowering object.  If it returns a new instruction
+// (representing the optimized address mode), then insert the new
+// instruction and delete the old.
+void CfgNode::doAddressOpt() {
+  TargetLowering *Target = Func->getTarget();
+  LoweringContext &Context = Target->getContext();
+  Context.init(this);
+  while (!Context.atEnd()) {
+    Target->doAddressOpt();
+  }
+}
 // Drives the target lowering.  Passes the current instruction and the
 // next non-deleted instruction for target lowering.
 void CfgNode::genCode() {
@@ -162,6 +207,189 @@ void CfgNode::genCode() {
  }
 }
+void CfgNode::livenessLightweight() {
+  SizeT NumVars = Func->getNumVariables();
+  llvm::BitVector Live(NumVars);
+  // Process regular instructions in reverse order.
+  for (InstList::const_reverse_iterator I = Insts.rbegin(), E = Insts.rend();
+       I != E; ++I) {
+    if ((*I)->isDeleted())
+      continue;
+    (*I)->livenessLightweight(Live);
+  }
+  for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E; ++I) {
+    if ((*I)->isDeleted())
+      continue;
+    (*I)->livenessLightweight(Live);
+  }
+}
+// Performs liveness analysis on the block.  Returns true if the
+// incoming liveness changed from before, false if it stayed the same.
+// (If it changes, the node's predecessors need to be processed
+// again.)
+bool CfgNode::liveness(Liveness *Liveness) {
+  SizeT NumVars = Liveness->getNumVarsInNode(this);
+  llvm::BitVector Live(NumVars);
+  // Mark the beginning and ending of each variable's live range
+  // with the sentinel instruction number 0.
+  std::vector<InstNumberT> &LiveBegin = Liveness->getLiveBegin(this);
+  std::vector<InstNumberT> &LiveEnd = Liveness->getLiveEnd(this);
+  LiveBegin.assign(NumVars, Inst::NumberSentinel);
+  LiveEnd.assign(NumVars, Inst::NumberSentinel);
+  // Initialize Live to be the union of all successors' LiveIn.
+  for (NodeList::const_iterator I = OutEdges.begin(), E = OutEdges.end();
+       I != E; ++I) {
+    CfgNode *Succ = *I;
+    Live |= Liveness->getLiveIn(Succ);
+    // Mark corresponding argument of phis in successor as live.
+    for (PhiList::const_iterator I1 = Succ->Phis.begin(), E1 = Succ->Phis.end();
+         I1 != E1; ++I1) {
+      (*I1)->livenessPhiOperand(Live, this, Liveness);
+    }
+  }
+  Liveness->getLiveOut(this) = Live;
+  // Process regular instructions in reverse order.
+  for (InstList::const_reverse_iterator I = Insts.rbegin(), E = Insts.rend();
+       I != E; ++I) {
+    if ((*I)->isDeleted())
+      continue;
+    (*I)->liveness((*I)->getNumber(), Live, Liveness, this);
+  }
+  // Process phis in forward order so that we can override the
+  // instruction number to be that of the earliest phi instruction in
+  // the block.
+  InstNumberT FirstPhiNumber = Inst::NumberSentinel;
+  for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E; ++I) {
+    if ((*I)->isDeleted())
+      continue;
+    if (FirstPhiNumber == Inst::NumberSentinel)
+      FirstPhiNumber = (*I)->getNumber();
+    (*I)->liveness(FirstPhiNumber, Live, Liveness, this);
+  }
+  // When using the sparse representation, after traversing the
+  // instructions in the block, the Live bitvector should only contain
+  // set bits for global variables upon block entry.  We validate this
+  // by shrinking the Live vector and then testing it against the
+  // pre-shrunk version.  (The shrinking is required, but the
+  // validation is not.)
+  llvm::BitVector LiveOrig = Live;
+  Live.resize(Liveness->getNumGlobalVars());
+  // Non-global arguments in the entry node are allowed to be live on
+  // entry.
+  bool IsEntry = (Func->getEntryNode() == this);
+  if (!(IsEntry || Live == LiveOrig)) {
+    // This is a fatal liveness consistency error.  Print some
+    // diagnostics and abort.
+    Ostream &Str = Func->getContext()->getStrDump();
+    Func->setCurrentNode(NULL);
+    Str << "LiveOrig-Live =";
+    for (SizeT i = Live.size(); i < LiveOrig.size(); ++i) {
+      if (LiveOrig.test(i)) {
+        Str << " ";
+        Liveness->getVariable(i, this)->dump(Func);
+      }
+    }
+    Str << "\n";
+    llvm_unreachable("Fatal inconsistency in liveness analysis");
+  }
+  bool Changed = false;
+  llvm::BitVector &LiveIn = Liveness->getLiveIn(this);
+  // Add in current LiveIn
+  Live |= LiveIn;
+  // Check result, set LiveIn=Live
+  Changed = (Live != LiveIn);
+  if (Changed)
+    LiveIn = Live;
+  return Changed;
+}
+// Now that basic liveness is complete, remove dead instructions that
+// were tentatively marked as dead, and compute actual live ranges.
+// It is assumed that within a single basic block, a live range begins
+// at most once and ends at most once.  This is certainly true for
+// pure SSA form.  It is also true once phis are lowered, since each
+// assignment to the phi-based temporary is in a different basic
+// block, and there is a single read that ends the live in the basic
+// block that contained the actual phi instruction.
+void CfgNode::livenessPostprocess(LivenessMode Mode, Liveness *Liveness) {
+  InstNumberT FirstInstNum = Inst::NumberSentinel;
+  InstNumberT LastInstNum = Inst::NumberSentinel;
+  // Process phis in any order.  Process only Dest operands.
+  for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E; ++I) {
+    InstPhi *Inst = *I;
+    Inst->deleteIfDead();
+    if (Inst->isDeleted())
+      continue;
+    if (FirstInstNum == Inst::NumberSentinel)
+      FirstInstNum = Inst->getNumber();
+    assert(Inst->getNumber() > LastInstNum);
+    LastInstNum = Inst->getNumber();
+  }
+  // Process instructions
+  for (InstList::const_iterator I = Insts.begin(), E = Insts.end(); I != E;
+       ++I) {
+    Inst *Inst = *I;
+    Inst->deleteIfDead();
+    if (Inst->isDeleted())
+      continue;
+    if (FirstInstNum == Inst::NumberSentinel)
+      FirstInstNum = Inst->getNumber();
+    assert(Inst->getNumber() > LastInstNum);
+    LastInstNum = Inst->getNumber();
+    // Create fake live ranges for a Kill instruction, but only if the
+    // linked instruction is still alive.
+    if (Mode == Liveness_Intervals) {
+      if (InstFakeKill *Kill = llvm::dyn_cast<InstFakeKill>(Inst)) {
+        if (!Kill->getLinked()->isDeleted()) {
+          SizeT NumSrcs = Inst->getSrcSize();
+          for (SizeT i = 0; i < NumSrcs; ++i) {
+            Variable *Var = llvm::cast<Variable>(Inst->getSrc(i));
+            InstNumberT InstNumber = Inst->getNumber();
+            Liveness->addLiveRange(Var, InstNumber, InstNumber, 1);
+          }
+        }
+      }
+    }
+  }
+  if (Mode != Liveness_Intervals)
+    return;
+  SizeT NumVars = Liveness->getNumVarsInNode(this);
+  SizeT NumGlobals = Liveness->getNumGlobalVars();
+  llvm::BitVector &LiveIn = Liveness->getLiveIn(this);
+  llvm::BitVector &LiveOut = Liveness->getLiveOut(this);
+  std::vector<InstNumberT> &LiveBegin = Liveness->getLiveBegin(this);
+  std::vector<InstNumberT> &LiveEnd = Liveness->getLiveEnd(this);
+  for (SizeT i = 0; i < NumVars; ++i) {
+    // Deal with the case where the variable is both live-in and
+    // live-out, but LiveEnd comes before LiveBegin.  In this case, we
+    // need to add two segments to the live range because there is a
+    // hole in the middle.  This would typically happen as a result of
+    // phi lowering in the presence of loopback edges.
+    bool IsGlobal = (i < NumGlobals);
+    if (IsGlobal && LiveIn[i] && LiveOut[i] && LiveBegin[i] > LiveEnd[i]) {
+      Variable *Var = Liveness->getVariable(i, this);
+      Liveness->addLiveRange(Var, FirstInstNum, LiveEnd[i], 1);
+      Liveness->addLiveRange(Var, LiveBegin[i], LastInstNum + 1, 1);
+      continue;
+    }
+    InstNumberT Begin = (IsGlobal && LiveIn[i]) ? FirstInstNum : LiveBegin[i];
+    InstNumberT End = (IsGlobal && LiveOut[i]) ? LastInstNum + 1 : LiveEnd[i];
+    if (Begin == Inst::NumberSentinel && End == Inst::NumberSentinel)
+      continue;
+    if (Begin <= FirstInstNum)
+      Begin = FirstInstNum;
+    if (End == Inst::NumberSentinel)
+      End = LastInstNum + 1;
+    Variable *Var = Liveness->getVariable(i, this);
+    Liveness->addLiveRange(Var, Begin, End, 1);
+  }
+}
 // ======================== Dump routines ======================== //
 void CfgNode::emit(Cfg *Func) const {
@@ -194,6 +422,7 @@ void CfgNode::emit(Cfg *Func) const {
 void CfgNode::dump(Cfg *Func) const {
  Func->setCurrentNode(this);
  Ostream &Str = Func->getContext()->getStrDump();
+  Liveness *Liveness = Func->getLiveness();
  if (Func->getContext()->isVerbose(IceV_Instructions)) {
    Str << getName() << ":\n";
  }
@@ -208,6 +437,19 @@ void CfgNode::dump(Cfg *Func) const {
    }
    Str << "\n";
  }
+  // Dump the live-in variables.
+  llvm::BitVector LiveIn;
+  if (Liveness)
+    LiveIn = Liveness->getLiveIn(this);
+  if (Func->getContext()->isVerbose(IceV_Liveness) && !LiveIn.empty()) {
+    Str << "    // LiveIn:";
+    for (SizeT i = 0; i < LiveIn.size(); ++i) {
+      if (LiveIn[i]) {
+        Str << " %" << Liveness->getVariable(i, this)->getName();
+      }
+    }
+    Str << "\n";
+  }
  // Dump each instruction.
  if (Func->getContext()->isVerbose(IceV_Instructions)) {
    for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E;
@@ -221,6 +463,19 @@ void CfgNode::dump(Cfg *Func) const {
      Inst->dumpDecorated(Func);
    }
  }
+  // Dump the live-out variables.
+  llvm::BitVector LiveOut;
+  if (Liveness)
+    LiveOut = Liveness->getLiveOut(this);
+  if (Func->getContext()->isVerbose(IceV_Liveness) && !LiveOut.empty()) {
+    Str << "    // LiveOut:";
+    for (SizeT i = 0; i < LiveOut.size(); ++i) {
+      if (LiveOut[i]) {
+        Str << " %" << Liveness->getVariable(i, this)->getName();
+      }
+    }
+    Str << "\n";
+  }
  // Dump list of successor nodes.
  if (Func->getContext()->isVerbose(IceV_Succs)) {
    Str << "    // succs = ";

--- a/src/IceCfgNode.h
+++ b/src/IceCfgNode.h
@@ -45,6 +45,7 @@ public:
  // Manage the instruction list.
  InstList &getInsts() { return Insts; }
  void appendInst(Inst *Inst);
+  void renumberInstructions();
  // Add a predecessor edge to the InEdges list for each of this
  // node's successors.
@@ -53,7 +54,11 @@ public:
  void placePhiLoads();
  void placePhiStores();
  void deletePhis();
+  void doAddressOpt();
  void genCode();
+  void livenessLightweight();
+  bool liveness(Liveness *Liveness);
+  void livenessPostprocess(LivenessMode Mode, Liveness *Liveness);
  void emit(Cfg *Func) const;
  void dump(Cfg *Func) const;

--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -50,6 +50,8 @@ class GlobalContext;
 class Inst;
 class InstPhi;
 class InstTarget;
+class LiveRange;
+class Liveness;
 class Operand;
 class TargetLowering;
 class Variable;
@@ -68,6 +70,23 @@ typedef std::vector<Constant *> ConstantList;
 // may be 64-bits wide) when we want to save space.
 typedef uint32_t SizeT;
+// InstNumberT is for holding an instruction number.  Instruction
+// numbers are used for representing Variable live ranges.
+typedef int32_t InstNumberT;
+enum LivenessMode {
+  // Basic version of live-range-end calculation.  Marks the last uses
+  // of variables based on dataflow analysis.  Records the set of
+  // live-in and live-out variables for each block.  Identifies and
+  // deletes dead instructions (primarily stores).
+  Liveness_Basic,
+  // In addition to Liveness_Basic, also calculate the complete
+  // live range for each variable in a form suitable for interference
+  // calculation and register allocation.
+  Liveness_Intervals
+};
 enum VerboseItem {
  IceV_None = 0,
  IceV_Instructions = 1 << 0,

--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -12,6 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
+#include <ctype.h> // isdigit()
 #include "IceDefs.h"
 #include "IceTypes.h"
 #include "IceCfg.h"
@@ -129,8 +131,14 @@ IceString GlobalContext::mangleName(const IceString &Name) const {
    return NewName;
  }
-  ItemsParsed = sscanf(Name.c_str(), "_Z%u%s", &BaseLength, NameBase);
+  // Artificially limit BaseLength to 9 digits (less than 1 billion)
-  if (ItemsParsed == 2 && BaseLength <= strlen(NameBase)) {
+  // because sscanf behavior is undefined on integer overflow.  If
+  // there are more than 9 digits (which we test by looking at the
+  // beginning of NameBase), then we consider this a failure to parse
+  // a namespace mangling, and fall back to the simple prefixing.
+  ItemsParsed = sscanf(Name.c_str(), "_Z%9u%s", &BaseLength, NameBase);
+  if (ItemsParsed == 2 && BaseLength <= strlen(NameBase) &&
+      !isdigit(NameBase[0])) {
    // Transform _Z3barxyz ==> _ZN6Prefix3barExyz
    //                           ^^^^^^^^    ^
    // (splice in "N6Prefix", and insert "E" after "3bar")

--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -15,6 +15,7 @@
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceInst.h"
+#include "IceLiveness.h"
 #include "IceOperand.h"
 namespace Ice {
@@ -74,25 +75,144 @@ const size_t InstIcmpAttributesSize = llvm::array_lengthof(InstIcmpAttributes);
 } // end of anonymous namespace
 Inst::Inst(Cfg *Func, InstKind Kind, SizeT MaxSrcs, Variable *Dest)
-    : Kind(Kind), Number(Func->newInstNumber()), Deleted(false),
+    : Kind(Kind), Number(Func->newInstNumber()), Deleted(false), Dead(false),
      HasSideEffects(false), Dest(Dest), MaxSrcs(MaxSrcs), NumSrcs(0),
-      Srcs(Func->allocateArrayOf<Operand *>(MaxSrcs)) {}
+      Srcs(Func->allocateArrayOf<Operand *>(MaxSrcs)), LiveRangesEnded(0) {}
+// Assign the instruction a new number.
+void Inst::renumber(Cfg *Func) {
+  Number = isDeleted() ? NumberDeleted : Func->newInstNumber();
+}
+// Delete the instruction if its tentative Dead flag is still set
+// after liveness analysis.
+void Inst::deleteIfDead() {
+  if (Dead)
+    setDeleted();
+}
+// If Src is a Variable, it returns true if this instruction ends
+// Src's live range.  Otherwise, returns false.
+bool Inst::isLastUse(const Operand *TestSrc) const {
+  if (LiveRangesEnded == 0)
+    return false; // early-exit optimization
+  if (const Variable *TestVar = llvm::dyn_cast<const Variable>(TestSrc)) {
+    LREndedBits Mask = LiveRangesEnded;
+    for (SizeT I = 0; I < getSrcSize(); ++I) {
+      Operand *Src = getSrc(I);
+      SizeT NumVars = Src->getNumVars();
+      for (SizeT J = 0; J < NumVars; ++J) {
+        const Variable *Var = Src->getVar(J);
+        if (Var == TestVar) {
+          // We've found where the variable is used in the instruction.
+          return Mask & 1;
+        }
+        Mask >>= 1;
+        if (Mask == 0)
+          return false; // another early-exit optimization
+      }
+    }
+  }
+  return false;
+}
 void Inst::updateVars(CfgNode *Node) {
  if (Dest)
    Dest->setDefinition(this, Node);
-  SizeT VarIndex = 0;
  for (SizeT I = 0; I < getSrcSize(); ++I) {
    Operand *Src = getSrc(I);
    SizeT NumVars = Src->getNumVars();
-    for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
+    for (SizeT J = 0; J < NumVars; ++J) {
      Variable *Var = Src->getVar(J);
      Var->setUse(this, Node);
    }
  }
 }
+void Inst::livenessLightweight(llvm::BitVector &Live) {
+  assert(!isDeleted());
+  if (llvm::isa<InstFakeKill>(this))
+    return;
+  resetLastUses();
+  SizeT VarIndex = 0;
+  for (SizeT I = 0; I < getSrcSize(); ++I) {
+    Operand *Src = getSrc(I);
+    SizeT NumVars = Src->getNumVars();
+    for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
+      const Variable *Var = Src->getVar(J);
+      if (Var->isMultiblockLife())
+        continue;
+      SizeT Index = Var->getIndex();
+      if (Live[Index])
+        continue;
+      Live[Index] = true;
+      setLastUse(VarIndex);
+    }
+  }
+}
+void Inst::liveness(InstNumberT InstNumber, llvm::BitVector &Live,
+                    Liveness *Liveness, const CfgNode *Node) {
+  assert(!isDeleted());
+  if (llvm::isa<InstFakeKill>(this))
+    return;
+  std::vector<InstNumberT> &LiveBegin = Liveness->getLiveBegin(Node);
+  std::vector<InstNumberT> &LiveEnd = Liveness->getLiveEnd(Node);
+  Dead = false;
+  if (Dest) {
+    SizeT VarNum = Liveness->getLiveIndex(Dest);
+    if (Live[VarNum]) {
+      Live[VarNum] = false;
+      LiveBegin[VarNum] = InstNumber;
+    } else {
+      if (!hasSideEffects())
+        Dead = true;
+    }
+  }
+  if (Dead)
+    return;
+  // Phi arguments only get added to Live in the predecessor node, but
+  // we still need to update LiveRangesEnded.
+  bool IsPhi = llvm::isa<InstPhi>(this);
+  resetLastUses();
+  SizeT VarIndex = 0;
+  for (SizeT I = 0; I < getSrcSize(); ++I) {
+    Operand *Src = getSrc(I);
+    SizeT NumVars = Src->getNumVars();
+    for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
+      const Variable *Var = Src->getVar(J);
+      SizeT VarNum = Liveness->getLiveIndex(Var);
+      if (!Live[VarNum]) {
+        setLastUse(VarIndex);
+        if (!IsPhi) {
+          Live[VarNum] = true;
+          // For a variable in SSA form, its live range can end at
+          // most once in a basic block.  However, after lowering to
+          // two-address instructions, we end up with sequences like
+          // "t=b;t+=c;a=t" where t's live range begins and ends
+          // twice.  ICE only allows a variable to have a single
+          // liveness interval in a basic block (except for blocks
+          // where a variable is live-in and live-out but there is a
+          // gap in the middle, and except for the special
+          // InstFakeKill instruction that can appear multiple
+          // times in the same block).  Therefore, this lowered
+          // sequence needs to represent a single conservative live
+          // range for t.  Since the instructions are being traversed
+          // backwards, we make sure LiveEnd is only set once by
+          // setting it only when LiveEnd[VarNum]==0 (sentinel value).
+          // Note that it's OK to set LiveBegin multiple times because
+          // of the backwards traversal.
+          if (LiveEnd[VarNum] == 0) {
+            LiveEnd[VarNum] = InstNumber;
+          }
+        }
+      }
+    }
+  }
+}
 InstAlloca::InstAlloca(Cfg *Func, Operand *ByteCount, uint32_t AlignInBytes,
                       Variable *Dest)
    : Inst(Func, Inst::Alloca, 1, Dest), AlignInBytes(AlignInBytes) {
@@ -192,6 +312,28 @@ Operand *InstPhi::getOperandForTarget(CfgNode *Target) const {
  return NULL;
 }
+// Updates liveness for a particular operand based on the given
+// predecessor edge.  Doesn't mark the operand as live if the Phi
+// instruction is dead or deleted.
+void InstPhi::livenessPhiOperand(llvm::BitVector &Live, CfgNode *Target,
+                                 Liveness *Liveness) {
+  if (isDeleted() || Dead)
+    return;
+  for (SizeT I = 0; I < getSrcSize(); ++I) {
+    if (Labels[I] == Target) {
+      if (Variable *Var = llvm::dyn_cast<Variable>(getSrc(I))) {
+        SizeT SrcIndex = Liveness->getLiveIndex(Var);
+        if (!Live[SrcIndex]) {
+          setLastUse(I);
+          Live[SrcIndex] = true;
+        }
+      }
+      return;
+    }
+  }
+  llvm_unreachable("Phi operand not found for specified target node");
+}
 // Change "a=phi(...)" to "a_phi=phi(...)" and return a new
 // instruction "a=a_phi".
 Inst *InstPhi::lower(Cfg *Func, CfgNode *Node) {
@@ -294,8 +436,8 @@ void Inst::dumpDecorated(const Cfg *Func) const {
    return;
  if (Func->getContext()->isVerbose(IceV_InstNumbers)) {
    char buf[30];
-    int32_t Number = getNumber();
+    InstNumberT Number = getNumber();
-    if (Number < 0)
+    if (Number == NumberDeleted)
      snprintf(buf, llvm::array_lengthof(buf), "[XXX]");
    else
      snprintf(buf, llvm::array_lengthof(buf), "[%3d]", Number);
@@ -305,6 +447,7 @@ void Inst::dumpDecorated(const Cfg *Func) const {
  if (isDeleted())
    Str << "  //";
  dump(Func);
+  dumpExtras(Func);
  Str << "\n";
 }
@@ -319,6 +462,32 @@ void Inst::dump(const Cfg *Func) const {
  dumpSources(Func);
 }
+void Inst::dumpExtras(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  bool First = true;
+  // Print "LIVEEND={a,b,c}" for all source operands whose live ranges
+  // are known to end at this instruction.
+  if (Func->getContext()->isVerbose(IceV_Liveness)) {
+    for (SizeT I = 0; I < getSrcSize(); ++I) {
+      Operand *Src = getSrc(I);
+      SizeT NumVars = Src->getNumVars();
+      for (SizeT J = 0; J < NumVars; ++J) {
+        const Variable *Var = Src->getVar(J);
+        if (isLastUse(Var)) {
+          if (First)
+            Str << " // LIVEEND={";
+          else
+            Str << ",";
+          Var->dump(Func);
+          First = false;
+        }
+      }
+    }
+    if (!First)
+      Str << "}";
+  }
+}
 void Inst::dumpSources(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrDump();
  for (SizeT I = 0; I < getSrcSize(); ++I) {
@@ -553,4 +722,6 @@ void InstTarget::dump(const Cfg *Func) const {
  Inst::dump(Func);
 }
+void InstTarget::dumpExtras(const Cfg *Func) const { Inst::dumpExtras(Func); }
 } // end of namespace Ice
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -56,10 +56,14 @@ public:
  };
  InstKind getKind() const { return Kind; }
-  int32_t getNumber() const { return Number; }
+  InstNumberT getNumber() const { return Number; }
+  void renumber(Cfg *Func);
+  static const InstNumberT NumberDeleted = -1;
+  static const InstNumberT NumberSentinel = 0;
  bool isDeleted() const { return Deleted; }
  void setDeleted() { Deleted = true; }
+  void deleteIfDead();
  bool hasSideEffects() const { return HasSideEffects; }
@@ -71,6 +75,8 @@ public:
    return Srcs[I];
  }
+  bool isLastUse(const Operand *Src) const;
  // Returns a list of out-edges corresponding to a terminator
  // instruction, which is the last instruction of the block.
  virtual NodeList getTerminatorEdges() const {
@@ -88,8 +94,12 @@ public:
  // basic blocks, i.e. used in a different block from their definition.
  void updateVars(CfgNode *Node);
+  void livenessLightweight(llvm::BitVector &Live);
+  void liveness(InstNumberT InstNumber, llvm::BitVector &Live,
+                Liveness *Liveness, const CfgNode *Node);
  virtual void emit(const Cfg *Func) const;
  virtual void dump(const Cfg *Func) const;
+  virtual void dumpExtras(const Cfg *Func) const;
  void dumpDecorated(const Cfg *Func) const;
  void emitSources(const Cfg *Func) const;
  void dumpSources(const Cfg *Func) const;
@@ -105,15 +115,22 @@ protected:
    assert(NumSrcs < MaxSrcs);
    Srcs[NumSrcs++] = Src;
  }
+  void setLastUse(SizeT VarIndex) {
+    if (VarIndex < CHAR_BIT * sizeof(LiveRangesEnded))
+      LiveRangesEnded |= (((LREndedBits)1u) << VarIndex);
+  }
+  void resetLastUses() { LiveRangesEnded = 0; }
  // The destroy() method lets the instruction cleanly release any
  // memory that was allocated via the Cfg's allocator.
  virtual void destroy(Cfg *Func) { Func->deallocateArrayOf<Operand *>(Srcs); }
  const InstKind Kind;
  // Number is the instruction number for describing live ranges.
-  int32_t Number;
+  InstNumberT Number;
  // Deleted means irrevocably deleted.
  bool Deleted;
+  // Dead means pending deletion after liveness analysis converges.
+  bool Dead;
  // HasSideEffects means the instruction is something like a function
  // call or a volatile load that can't be removed even if its Dest
  // variable is not live.
@@ -124,6 +141,18 @@ protected:
  SizeT NumSrcs;
  Operand **Srcs;
+  // LiveRangesEnded marks which Variables' live ranges end in this
+  // instruction.  An instruction can have an arbitrary number of
+  // source operands (e.g. a call instruction), and each source
+  // operand can contain 0 or 1 Variable (and target-specific operands
+  // could contain more than 1 Variable).  All the variables in an
+  // instruction are conceptually flattened and each variable is
+  // mapped to one bit position of the LiveRangesEnded bit vector.
+  // Only the first CHAR_BIT * sizeof(LREndedBits) variables are
+  // tracked this way.
+  typedef uint32_t LREndedBits; // only first 32 src operands tracked, sorry
+  LREndedBits LiveRangesEnded;
 private:
  Inst(const Inst &) LLVM_DELETED_FUNCTION;
  Inst &operator=(const Inst &) LLVM_DELETED_FUNCTION;
@@ -393,6 +422,8 @@ public:
  }
  void addArgument(Operand *Source, CfgNode *Label);
  Operand *getOperandForTarget(CfgNode *Target) const;
+  void livenessPhiOperand(llvm::BitVector &Live, CfgNode *Target,
+                          Liveness *Liveness);
  Inst *lower(Cfg *Func, CfgNode *Node);
  virtual void dump(const Cfg *Func) const;
  static bool classof(const Inst *Inst) { return Inst->getKind() == Phi; }
@@ -626,6 +657,7 @@ class InstTarget : public Inst {
 public:
  virtual void emit(const Cfg *Func) const = 0;
  virtual void dump(const Cfg *Func) const;
+  virtual void dumpExtras(const Cfg *Func) const;
  static bool classof(const Inst *Inst) { return Inst->getKind() >= Target; }
 protected:

--- a/src/IceLiveness.cpp
+++ b/src/IceLiveness.cpp
+//===- subzero/src/IceLiveness.cpp - Liveness analysis implementation -----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides some of the support for the Liveness class.  In
+// particular, it handles the sparsity representation of the mapping
+// between Variables and CfgNodes.  The idea is that since most
+// variables are used only within a single basic block, we can
+// partition the variables into "local" and "global" sets.  Instead of
+// sizing and indexing vectors according to Variable::Number, we
+// create a mapping such that global variables are mapped to low
+// indexes that are common across nodes, and local variables are
+// mapped to a higher index space that is shared across nodes.
+//
+//===----------------------------------------------------------------------===//
+#include "IceDefs.h"
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceInst.h"
+#include "IceLiveness.h"
+#include "IceOperand.h"
+namespace Ice {
+void Liveness::init() {
+  // Initialize most of the container sizes.
+  SizeT NumVars = Func->getVariables().size();
+  SizeT NumNodes = Func->getNumNodes();
+  Nodes.resize(NumNodes);
+  VarToLiveMap.resize(NumVars);
+  if (Mode == Liveness_Intervals)
+    LiveRanges.resize(NumVars);
+  // Count the number of globals, and the number of locals for each
+  // block.
+  for (SizeT i = 0; i < NumVars; ++i) {
+    Variable *Var = Func->getVariables()[i];
+    if (Var->isMultiblockLife()) {
+      ++NumGlobals;
+    } else {
+      SizeT Index = Var->getLocalUseNode()->getIndex();
+      ++Nodes[Index].NumLocals;
+    }
+  }
+  // Resize each LivenessNode::LiveToVarMap, and the global
+  // LiveToVarMap.  Reset the counts to 0.
+  for (SizeT i = 0; i < NumNodes; ++i) {
+    Nodes[i].LiveToVarMap.assign(Nodes[i].NumLocals, NULL);
+    Nodes[i].NumLocals = 0;
+  }
+  LiveToVarMap.assign(NumGlobals, NULL);
+  // Sort each variable into the appropriate LiveToVarMap.  Also set
+  // VarToLiveMap.
+  SizeT TmpNumGlobals = 0;
+  for (SizeT i = 0; i < NumVars; ++i) {
+    Variable *Var = Func->getVariables()[i];
+    SizeT VarIndex = Var->getIndex();
+    SizeT LiveIndex;
+    if (Var->isMultiblockLife()) {
+      LiveIndex = TmpNumGlobals++;
+      LiveToVarMap[LiveIndex] = Var;
+    } else {
+      SizeT NodeIndex = Var->getLocalUseNode()->getIndex();
+      LiveIndex = Nodes[NodeIndex].NumLocals++;
+      Nodes[NodeIndex].LiveToVarMap[LiveIndex] = Var;
+      LiveIndex += NumGlobals;
+    }
+    VarToLiveMap[VarIndex] = LiveIndex;
+  }
+  assert(NumGlobals == TmpNumGlobals);
+  // Process each node.
+  const NodeList &LNodes = Func->getNodes();
+  SizeT NumLNodes = LNodes.size();
+  for (SizeT i = 0; i < NumLNodes; ++i) {
+    LivenessNode &Node = Nodes[LNodes[i]->getIndex()];
+    // NumLocals, LiveToVarMap already initialized
+    Node.LiveIn.resize(NumGlobals);
+    Node.LiveOut.resize(NumGlobals);
+    // LiveBegin and LiveEnd are reinitialized before each pass over
+    // the block.
+  }
+}
+Variable *Liveness::getVariable(SizeT LiveIndex, const CfgNode *Node) const {
+  if (LiveIndex < NumGlobals)
+    return LiveToVarMap[LiveIndex];
+  SizeT NodeIndex = Node->getIndex();
+  return Nodes[NodeIndex].LiveToVarMap[LiveIndex - NumGlobals];
+}
+SizeT Liveness::getLiveIndex(const Variable *Var) const {
+  return VarToLiveMap[Var->getIndex()];
+}
+void Liveness::addLiveRange(Variable *Var, InstNumberT Start, InstNumberT End,
+                            uint32_t WeightDelta) {
+  LiveRange &LiveRange = LiveRanges[Var->getIndex()];
+  assert(WeightDelta != RegWeight::Inf);
+  LiveRange.addSegment(Start, End);
+  LiveRange.addWeight(WeightDelta);
+}
+LiveRange &Liveness::getLiveRange(Variable *Var) {
+  return LiveRanges[Var->getIndex()];
+}
+} // end of namespace Ice
--- a/src/IceLiveness.h
+++ b/src/IceLiveness.h
+//===- subzero/src/IceLiveness.h - Liveness analysis ------------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Liveness and LivenessNode classes,
+// which are used for liveness analysis.  The node-specific
+// information tracked for each Variable includes whether it is
+// live on entry, whether it is live on exit, the instruction number
+// that starts its live range, and the instruction number that ends
+// its live range.  At the Cfg level, the actual live intervals are
+// recorded.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SUBZERO_SRC_ICELIVENESS_H
+#define SUBZERO_SRC_ICELIVENESS_H
+#include "IceDefs.h"
+#include "IceTypes.h"
+namespace Ice {
+class LivenessNode {
+public:
+  LivenessNode() : NumLocals(0) {}
+  // NumLocals is the number of Variables local to this block.
+  SizeT NumLocals;
+  // LiveToVarMap maps a liveness bitvector index to a Variable.  This
+  // is generally just for printing/dumping.  The index should be less
+  // than NumLocals + Liveness::NumGlobals.
+  std::vector<Variable *> LiveToVarMap;
+  // LiveIn and LiveOut track the in- and out-liveness of the global
+  // variables.  The size of each vector is
+  // LivenessNode::NumGlobals.
+  llvm::BitVector LiveIn, LiveOut;
+  // LiveBegin and LiveEnd track the instruction numbers of the start
+  // and end of each variable's live range within this block.  The
+  // size of each vector is NumLocals + Liveness::NumGlobals.
+  std::vector<InstNumberT> LiveBegin, LiveEnd;
+private:
+  // TODO: Disable these constructors when Liveness::Nodes is no
+  // longer an STL container.
+  // LivenessNode(const LivenessNode &) LLVM_DELETED_FUNCTION;
+  // LivenessNode &operator=(const LivenessNode &) LLVM_DELETED_FUNCTION;
+};
+class Liveness {
+public:
+  Liveness(Cfg *Func, LivenessMode Mode)
+      : Func(Func), Mode(Mode), NumGlobals(0) {}
+  void init();
+  Variable *getVariable(SizeT LiveIndex, const CfgNode *Node) const;
+  SizeT getLiveIndex(const Variable *Var) const;
+  SizeT getNumGlobalVars() const { return NumGlobals; }
+  SizeT getNumVarsInNode(const CfgNode *Node) const {
+    return NumGlobals + Nodes[Node->getIndex()].NumLocals;
+  }
+  llvm::BitVector &getLiveIn(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].LiveIn;
+  }
+  llvm::BitVector &getLiveOut(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].LiveOut;
+  }
+  std::vector<InstNumberT> &getLiveBegin(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].LiveBegin;
+  }
+  std::vector<InstNumberT> &getLiveEnd(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].LiveEnd;
+  }
+  LiveRange &getLiveRange(Variable *Var);
+  void addLiveRange(Variable *Var, InstNumberT Start, InstNumberT End,
+                    uint32_t WeightDelta);
+private:
+  Cfg *Func;
+  LivenessMode Mode;
+  SizeT NumGlobals;
+  // Size of Nodes is Cfg::Nodes.size().
+  std::vector<LivenessNode> Nodes;
+  // VarToLiveMap maps a Variable's Variable::Number to its live index
+  // within its basic block.
+  std::vector<SizeT> VarToLiveMap;
+  // LiveToVarMap is analogous to LivenessNode::LiveToVarMap, but for
+  // non-local variables.
+  std::vector<Variable *> LiveToVarMap;
+  // LiveRanges maps a Variable::Number to its live range.
+  std::vector<LiveRange> LiveRanges;
+  Liveness(const Liveness &) LLVM_DELETED_FUNCTION;
+  Liveness &operator=(const Liveness &) LLVM_DELETED_FUNCTION;
+};
+} // end of namespace Ice
+#endif // SUBZERO_SRC_ICELIVENESS_H
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -7,9 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the Operand class and its
+// This file implements the Operand class and its target-independent
-// target-independent subclasses, primarily for the methods of the
+// subclasses, primarily for the methods of the Variable class.
-// Variable class.
 //
 //===----------------------------------------------------------------------===//
@@ -36,6 +35,109 @@ bool operator==(const RegWeight &A, const RegWeight &B) {
  return !(B < A) && !(A < B);
 }
+void LiveRange::addSegment(InstNumberT Start, InstNumberT End) {
+#ifdef USE_SET
+  RangeElementType Element(Start, End);
+  RangeType::iterator Next = Range.lower_bound(Element);
+  assert(Next == Range.upper_bound(Element)); // Element not already present
+  // Beginning of code that merges contiguous segments.  TODO: change
+  // "if(true)" to "if(false)" to see if this extra optimization code
+  // gives any performance gain, or is just destabilizing.
+  if (true) {
+    RangeType::iterator FirstDelete = Next;
+    RangeType::iterator Prev = Next;
+    bool hasPrev = (Next != Range.begin());
+    bool hasNext = (Next != Range.end());
+    if (hasPrev)
+      --Prev;
+    // See if Element and Next should be joined.
+    if (hasNext && End == Next->first) {
+      Element.second = Next->second;
+      ++Next;
+    }
+    // See if Prev and Element should be joined.
+    if (hasPrev && Prev->second == Start) {
+      Element.first = Prev->first;
+      FirstDelete = Prev;
+    }
+    Range.erase(FirstDelete, Next);
+  }
+  // End of code that merges contiguous segments.
+  Range.insert(Next, Element);
+#else
+  if (Range.empty()) {
+    Range.push_back(RangeElementType(Start, End));
+    return;
+  }
+  // Special case for faking in-arg liveness.
+  if (End < Range.front().first) {
+    assert(Start < 0);
+    Range.push_front(RangeElementType(Start, End));
+    return;
+  }
+  InstNumberT CurrentEnd = Range.back().second;
+  assert(Start >= CurrentEnd);
+  // Check for merge opportunity.
+  if (Start == CurrentEnd) {
+    Range.back().second = End;
+    return;
+  }
+  Range.push_back(RangeElementType(Start, End));
+#endif
+}
+// Returns true if this live range ends before Other's live range
+// starts.  This means that the highest instruction number in this
+// live range is less than or equal to the lowest instruction number
+// of the Other live range.
+bool LiveRange::endsBefore(const LiveRange &Other) const {
+  // Neither range should be empty, but let's be graceful.
+  if (Range.empty() || Other.Range.empty())
+    return true;
+  InstNumberT MyEnd = (*Range.rbegin()).second;
+  InstNumberT OtherStart = (*Other.Range.begin()).first;
+  return MyEnd <= OtherStart;
+}
+// Returns true if there is any overlap between the two live ranges.
+bool LiveRange::overlaps(const LiveRange &Other) const {
+  // Do a two-finger walk through the two sorted lists of segments.
+  RangeType::const_iterator I1 = Range.begin(), I2 = Other.Range.begin();
+  RangeType::const_iterator E1 = Range.end(), E2 = Other.Range.end();
+  while (I1 != E1 && I2 != E2) {
+    if (I1->second <= I2->first) {
+      ++I1;
+      continue;
+    }
+    if (I2->second <= I1->first) {
+      ++I2;
+      continue;
+    }
+    return true;
+  }
+  return false;
+}
+bool LiveRange::overlaps(InstNumberT OtherBegin) const {
+  LiveRange Temp;
+  Temp.addSegment(OtherBegin, OtherBegin + 1);
+  return overlaps(Temp);
+}
+// Returns true if the live range contains the given instruction
+// number.  This is only used for validating the live range
+// calculation.
+bool LiveRange::containsValue(InstNumberT Value) const {
+  for (RangeType::const_iterator I = Range.begin(), E = Range.end(); I != E;
+       ++I) {
+    if (I->first <= Value && Value <= I->second)
+      return true;
+  }
+  return false;
+}
 void Variable::setUse(const Inst *Inst, const CfgNode *Node) {
  if (DefNode == NULL)
    return;
@@ -115,12 +217,12 @@ void Variable::dump(const Cfg *Func) const {
  }
 }
-void ConstantRelocatable::emit(const Cfg *Func) const {
+void ConstantRelocatable::emit(GlobalContext *Ctx) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
+  Ostream &Str = Ctx->getStrEmit();
  if (SuppressMangling)
    Str << Name;
  else
-    Str << Func->getContext()->mangleName(Name);
+    Str << Ctx->mangleName(Name);
  if (Offset) {
    if (Offset > 0)
      Str << "+";
@@ -128,13 +230,28 @@ void ConstantRelocatable::emit(const Cfg *Func) const {
  }
 }
-void ConstantRelocatable::dump(const Cfg *Func) const {
+void ConstantRelocatable::dump(GlobalContext *Ctx) const {
-  Ostream &Str = Func->getContext()->getStrDump();
+  Ostream &Str = Ctx->getStrDump();
  Str << "@" << Name;
  if (Offset)
    Str << "+" << Offset;
 }
+void LiveRange::dump(Ostream &Str) const {
+  Str << "(weight=" << Weight << ") ";
+  for (RangeType::const_iterator I = Range.begin(), E = Range.end(); I != E;
+       ++I) {
+    if (I != Range.begin())
+      Str << ", ";
+    Str << "[" << (*I).first << ":" << (*I).second << ")";
+  }
+}
+Ostream &operator<<(Ostream &Str, const LiveRange &L) {
+  L.dump(Str);
+  return Str;
+}
 Ostream &operator<<(Ostream &Str, const RegWeight &W) {
  if (W.getWeight() == RegWeight::Inf)
    Str << "Inf";

--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -81,8 +81,10 @@ private:
 class Constant : public Operand {
 public:
  uint32_t getPoolEntryID() const { return PoolEntryID; }
-  virtual void emit(const Cfg *Func) const = 0;
+  virtual void emit(const Cfg *Func) const { emit(Func->getContext()); }
-  virtual void dump(const Cfg *Func) const = 0;
+  virtual void dump(const Cfg *Func) const { dump(Func->getContext()); }
+  virtual void emit(GlobalContext *Ctx) const = 0;
+  virtual void dump(GlobalContext *Ctx) const = 0;
  static bool classof(const Operand *Operand) {
    OperandKind Kind = Operand->getKind();
@@ -116,12 +118,14 @@ public:
        ConstantPrimitive(Ty, Value, PoolEntryID);
  }
  T getValue() const { return Value; }
-  virtual void emit(const Cfg *Func) const {
+  using Constant::emit;
-    Ostream &Str = Func->getContext()->getStrEmit();
+  virtual void emit(GlobalContext *Ctx) const {
+    Ostream &Str = Ctx->getStrEmit();
    Str << getValue();
  }
-  virtual void dump(const Cfg *Func) const {
+  using Constant::dump;
-    Ostream &Str = Func->getContext()->getStrDump();
+  virtual void dump(GlobalContext *Ctx) const {
+    Ostream &Str = Ctx->getStrDump();
    Str << getValue();
  }
@@ -178,8 +182,10 @@ public:
  IceString getName() const { return Name; }
  void setSuppressMangling(bool Value) { SuppressMangling = Value; }
  bool getSuppressMangling() const { return SuppressMangling; }
-  virtual void emit(const Cfg *Func) const;
+  using Constant::emit;
-  virtual void dump(const Cfg *Func) const;
+  using Constant::dump;
+  virtual void emit(GlobalContext *Ctx) const;
+  virtual void dump(GlobalContext *Ctx) const;
  static bool classof(const Operand *Operand) {
    OperandKind Kind = Operand->getKind();
@@ -228,6 +234,55 @@ bool operator<(const RegWeight &A, const RegWeight &B);
 bool operator<=(const RegWeight &A, const RegWeight &B);
 bool operator==(const RegWeight &A, const RegWeight &B);
+// LiveRange is a set of instruction number intervals representing
+// a variable's live range.  Generally there is one interval per basic
+// block where the variable is live, but adjacent intervals get
+// coalesced into a single interval.  LiveRange also includes a
+// weight, in case e.g. we want a live range to have higher weight
+// inside a loop.
+class LiveRange {
+public:
+  LiveRange() : Weight(0) {}
+  void reset() {
+    Range.clear();
+    Weight.setWeight(0);
+  }
+  void addSegment(InstNumberT Start, InstNumberT End);
+  bool endsBefore(const LiveRange &Other) const;
+  bool overlaps(const LiveRange &Other) const;
+  bool overlaps(InstNumberT OtherBegin) const;
+  bool containsValue(InstNumberT Value) const;
+  bool isEmpty() const { return Range.empty(); }
+  InstNumberT getStart() const {
+    return Range.empty() ? -1 : Range.begin()->first;
+  }
+  RegWeight getWeight() const { return Weight; }
+  void setWeight(const RegWeight &NewWeight) { Weight = NewWeight; }
+  void addWeight(uint32_t Delta) { Weight.addWeight(Delta); }
+  void dump(Ostream &Str) const;
+  // Defining USE_SET uses std::set to hold the segments instead of
+  // std::list.  Using std::list will be slightly faster, but is more
+  // restrictive because new segments cannot be added in the middle.
+  //#define USE_SET
+private:
+  typedef std::pair<InstNumberT, InstNumberT> RangeElementType;
+#ifdef USE_SET
+  typedef std::set<RangeElementType> RangeType;
+#else
+  typedef std::list<RangeElementType> RangeType;
+#endif
+  RangeType Range;
+  RegWeight Weight;
+};
+Ostream &operator<<(Ostream &Str, const LiveRange &L);
 // Variable represents an operand that is register-allocated or
 // stack-allocated.  If it is register-allocated, it will ultimately
 // have a non-negative RegNum field.
@@ -263,6 +318,9 @@ public:
    assert(!hasReg() || RegNum == NewRegNum);
    RegNum = NewRegNum;
  }
+  bool hasRegTmp() const { return getRegNumTmp() != NoRegister; }
+  int32_t getRegNumTmp() const { return RegNumTmp; }
+  void setRegNumTmp(int32_t NewRegNum) { RegNumTmp = NewRegNum; }
  RegWeight getWeight() const { return Weight; }
  void setWeight(uint32_t NewWeight) { Weight = NewWeight; }
@@ -275,6 +333,19 @@ public:
    AllowRegisterOverlap = Overlap;
  }
+  const LiveRange &getLiveRange() const { return Live; }
+  void setLiveRange(const LiveRange &Range) { Live = Range; }
+  void resetLiveRange() { Live.reset(); }
+  void addLiveRange(InstNumberT Start, InstNumberT End, uint32_t WeightDelta) {
+    assert(WeightDelta != RegWeight::Inf);
+    Live.addSegment(Start, End);
+    if (Weight.isInf())
+      Live.setWeight(RegWeight::Inf);
+    else
+      Live.addWeight(WeightDelta * Weight.getWeight());
+  }
+  void setLiveRangeInfiniteWeight() { Live.setWeight(RegWeight::Inf); }
  Variable *getLo() const { return LoVar; }
  Variable *getHi() const { return HiVar; }
  void setLoHi(Variable *Lo, Variable *Hi) {
@@ -304,8 +375,8 @@ private:
  Variable(Type Ty, const CfgNode *Node, SizeT Index, const IceString &Name)
      : Operand(kVariable, Ty), Number(Index), Name(Name), DefInst(NULL),
        DefNode(Node), IsArgument(false), StackOffset(0), RegNum(NoRegister),
-        Weight(1), RegisterPreference(NULL), AllowRegisterOverlap(false),
+        RegNumTmp(NoRegister), Weight(1), RegisterPreference(NULL),
-        LoVar(NULL), HiVar(NULL) {
+        AllowRegisterOverlap(false), LoVar(NULL), HiVar(NULL) {
    Vars = VarsReal;
    Vars[0] = this;
    NumVars = 1;
@@ -334,6 +405,8 @@ private:
  // RegNum is the allocated register, or NoRegister if it isn't
  // register-allocated.
  int32_t RegNum;
+  // RegNumTmp is the tentative assignment during register allocation.
+  int32_t RegNumTmp;
  RegWeight Weight; // Register allocation priority
  // RegisterPreference says that if possible, the register allocator
  // should prefer the register that was assigned to this linked
@@ -345,6 +418,7 @@ private:
  // RegisterPreference and "share" a register even if the two live
  // ranges overlap.
  bool AllowRegisterOverlap;
+  LiveRange Live;
  // LoVar and HiVar are needed for lowering from 64 to 32 bits.  When
  // lowering from I64 to I32 on a 32-bit architecture, we split the
  // variable into two machine-size pieces.  LoVar is the low-order

--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
+//===- subzero/src/IceRegAlloc.cpp - Linear-scan implementation -----------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LinearScan class, which performs the
+// linear-scan register allocation after liveness analysis has been
+// performed.
+//
+//===----------------------------------------------------------------------===//
+#include "IceCfg.h"
+#include "IceInst.h"
+#include "IceOperand.h"
+#include "IceRegAlloc.h"
+#include "IceTargetLowering.h"
+namespace Ice {
+// Implements the linear-scan algorithm.  Based on "Linear Scan
+// Register Allocation in the Context of SSA Form and Register
+// Constraints" by Hanspeter Mössenböck and Michael Pfeiffer,
+// ftp://ftp.ssw.uni-linz.ac.at/pub/Papers/Moe02.PDF .  This
+// implementation is modified to take affinity into account and allow
+// two interfering variables to share the same register in certain
+// cases.
+//
+// Requires running Cfg::liveness(Liveness_RangesFull) in
+// preparation.  Results are assigned to Variable::RegNum for each
+// Variable.
+void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
+  assert(RegMaskFull.any()); // Sanity check
+  Unhandled.clear();
+  Handled.clear();
+  Inactive.clear();
+  Active.clear();
+  Ostream &Str = Func->getContext()->getStrDump();
+  Func->setCurrentNode(NULL);
+  // Gather the live ranges of all variables and add them to the
+  // Unhandled set.  TODO: Unhandled is a set<> which is based on a
+  // balanced binary tree, so inserting live ranges for N variables is
+  // O(N log N) complexity.  N may be proportional to the number of
+  // instructions, thanks to temporary generation during lowering.  As
+  // a result, it may be useful to design a better data structure for
+  // storing Func->getVariables().
+  const VarList &Vars = Func->getVariables();
+  for (VarList::const_iterator I = Vars.begin(), E = Vars.end(); I != E; ++I) {
+    Variable *Var = *I;
+    // Explicitly don't consider zero-weight variables, which are
+    // meant to be spill slots.
+    if (Var->getWeight() == RegWeight::Zero)
+      continue;
+    // Don't bother if the variable has a null live range, which means
+    // it was never referenced.
+    if (Var->getLiveRange().isEmpty())
+      continue;
+    Unhandled.insert(LiveRangeWrapper(Var));
+    if (Var->hasReg()) {
+      Var->setRegNumTmp(Var->getRegNum());
+      Var->setLiveRangeInfiniteWeight();
+    }
+  }
+  // RegUses[I] is the number of live ranges (variables) that register
+  // I is currently assigned to.  It can be greater than 1 as a result
+  // of Variable::AllowRegisterOverlap.
+  std::vector<int> RegUses(RegMaskFull.size());
+  // Unhandled is already set to all ranges in increasing order of
+  // start points.
+  assert(Active.empty());
+  assert(Inactive.empty());
+  assert(Handled.empty());
+  UnorderedRanges::iterator Next;
+  while (!Unhandled.empty()) {
+    LiveRangeWrapper Cur = *Unhandled.begin();
+    Unhandled.erase(Unhandled.begin());
+    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      Str << "\nConsidering  ";
+      Cur.dump(Func);
+      Str << "\n";
+    }
+    const llvm::SmallBitVector RegMask =
+        RegMaskFull &
+        Func->getTarget()->getRegisterSetForType(Cur.Var->getType());
+    // Check for precolored ranges.  If Cur is precolored, it
+    // definitely gets that register.  Previously processed live
+    // ranges would have avoided that register due to it being
+    // precolored.  Future processed live ranges won't evict that
+    // register because the live range has infinite weight.
+    if (Cur.Var->hasReg()) {
+      int32_t RegNum = Cur.Var->getRegNum();
+      // RegNumTmp should have already been set above.
+      assert(Cur.Var->getRegNumTmp() == RegNum);
+      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        Str << "Precoloring  ";
+        Cur.dump(Func);
+        Str << "\n";
+      }
+      Active.push_back(Cur);
+      assert(RegUses[RegNum] >= 0);
+      ++RegUses[RegNum];
+      continue;
+    }
+    // Check for active ranges that have expired or become inactive.
+    for (UnorderedRanges::iterator I = Active.begin(), E = Active.end(); I != E;
+         I = Next) {
+      Next = I;
+      ++Next;
+      LiveRangeWrapper Item = *I;
+      bool Moved = false;
+      if (Item.endsBefore(Cur)) {
+        // Move Item from Active to Handled list.
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Expiring     ";
+          Item.dump(Func);
+          Str << "\n";
+        }
+        Active.erase(I);
+        Handled.push_back(Item);
+        Moved = true;
+      } else if (!Item.overlapsStart(Cur)) {
+        // Move Item from Active to Inactive list.
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Inactivating ";
+          Item.dump(Func);
+          Str << "\n";
+        }
+        Active.erase(I);
+        Inactive.push_back(Item);
+        Moved = true;
+      }
+      if (Moved) {
+        // Decrement Item from RegUses[].
+        assert(Item.Var->hasRegTmp());
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        --RegUses[RegNum];
+        assert(RegUses[RegNum] >= 0);
+      }
+    }
+    // Check for inactive ranges that have expired or reactivated.
+    for (UnorderedRanges::iterator I = Inactive.begin(), E = Inactive.end();
+         I != E; I = Next) {
+      Next = I;
+      ++Next;
+      LiveRangeWrapper Item = *I;
+      if (Item.endsBefore(Cur)) {
+        // Move Item from Inactive to Handled list.
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Expiring     ";
+          Item.dump(Func);
+          Str << "\n";
+        }
+        Inactive.erase(I);
+        Handled.push_back(Item);
+      } else if (Item.overlapsStart(Cur)) {
+        // Move Item from Inactive to Active list.
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Reactivating ";
+          Item.dump(Func);
+          Str << "\n";
+        }
+        Inactive.erase(I);
+        Active.push_back(Item);
+        // Increment Item in RegUses[].
+        assert(Item.Var->hasRegTmp());
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        assert(RegUses[RegNum] >= 0);
+        ++RegUses[RegNum];
+      }
+    }
+    // Calculate available registers into Free[].
+    llvm::SmallBitVector Free = RegMask;
+    for (SizeT i = 0; i < RegMask.size(); ++i) {
+      if (RegUses[i] > 0)
+        Free[i] = false;
+    }
+    // Remove registers from the Free[] list where an Inactive range
+    // overlaps with the current range.
+    for (UnorderedRanges::const_iterator I = Inactive.begin(),
+                                         E = Inactive.end();
+         I != E; ++I) {
+      LiveRangeWrapper Item = *I;
+      if (Item.overlaps(Cur)) {
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        // Don't assert(Free[RegNum]) because in theory (though
+        // probably never in practice) there could be two inactive
+        // variables that were allowed marked with
+        // AllowRegisterOverlap.
+        Free[RegNum] = false;
+      }
+    }
+    // Remove registers from the Free[] list where an Unhandled range
+    // overlaps with the current range and is precolored.
+    // Cur.endsBefore(*I) is an early exit check that turns a
+    // guaranteed O(N^2) algorithm into expected linear complexity.
+    for (OrderedRanges::const_iterator I = Unhandled.begin(),
+                                       E = Unhandled.end();
+         I != E && !Cur.endsBefore(*I); ++I) {
+      LiveRangeWrapper Item = *I;
+      if (Item.Var->hasReg() && Item.overlaps(Cur)) {
+        Free[Item.Var->getRegNum()] = false; // Note: getRegNum not getRegNumTmp
+      }
+    }
+    // Print info about physical register availability.
+    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      for (SizeT i = 0; i < RegMask.size(); ++i) {
+        if (RegMask[i]) {
+          Str << Func->getTarget()->getRegName(i, IceType_i32)
+              << "(U=" << RegUses[i] << ",F=" << Free[i] << ") ";
+        }
+      }
+      Str << "\n";
+    }
+    Variable *Prefer = Cur.Var->getPreferredRegister();
+    int32_t PreferReg = Prefer && Prefer->hasRegTmp() ? Prefer->getRegNumTmp()
+                                                      : Variable::NoRegister;
+    if (PreferReg != Variable::NoRegister &&
+        (Cur.Var->getRegisterOverlap() || Free[PreferReg])) {
+      // First choice: a preferred register that is either free or is
+      // allowed to overlap with its linked variable.
+      Cur.Var->setRegNumTmp(PreferReg);
+      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        Str << "Preferring   ";
+        Cur.dump(Func);
+        Str << "\n";
+      }
+      assert(RegUses[PreferReg] >= 0);
+      ++RegUses[PreferReg];
+      Active.push_back(Cur);
+    } else if (Free.any()) {
+      // Second choice: any free register.  TODO: After explicit
+      // affinity is considered, is there a strategy better than just
+      // picking the lowest-numbered available register?
+      int32_t RegNum = Free.find_first();
+      Cur.Var->setRegNumTmp(RegNum);
+      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        Str << "Allocating   ";
+        Cur.dump(Func);
+        Str << "\n";
+      }
+      assert(RegUses[RegNum] >= 0);
+      ++RegUses[RegNum];
+      Active.push_back(Cur);
+    } else {
+      // Fallback: there are no free registers, so we look for the
+      // lowest-weight register and see if Cur has higher weight.
+      std::vector<RegWeight> Weights(RegMask.size());
+      // Check Active ranges.
+      for (UnorderedRanges::const_iterator I = Active.begin(), E = Active.end();
+           I != E; ++I) {
+        LiveRangeWrapper Item = *I;
+        assert(Item.overlaps(Cur));
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        assert(Item.Var->hasRegTmp());
+        Weights[RegNum].addWeight(Item.range().getWeight());
+      }
+      // Same as above, but check Inactive ranges instead of Active.
+      for (UnorderedRanges::const_iterator I = Inactive.begin(),
+                                           E = Inactive.end();
+           I != E; ++I) {
+        LiveRangeWrapper Item = *I;
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        assert(Item.Var->hasRegTmp());
+        if (Item.overlaps(Cur))
+          Weights[RegNum].addWeight(Item.range().getWeight());
+      }
+      // Check Unhandled ranges that overlap Cur and are precolored.
+      // Cur.endsBefore(*I) is an early exit check that turns a
+      // guaranteed O(N^2) algorithm into expected linear complexity.
+      for (OrderedRanges::const_iterator I = Unhandled.begin(),
+                                         E = Unhandled.end();
+           I != E && !Cur.endsBefore(*I); ++I) {
+        LiveRangeWrapper Item = *I;
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        if (RegNum < 0)
+          continue;
+        if (Item.overlaps(Cur))
+          Weights[RegNum].setWeight(RegWeight::Inf);
+      }
+      // All the weights are now calculated.  Find the register with
+      // smallest weight.
+      int32_t MinWeightIndex = RegMask.find_first();
+      // MinWeightIndex must be valid because of the initial
+      // RegMask.any() test.
+      assert(MinWeightIndex >= 0);
+      for (SizeT i = MinWeightIndex + 1; i < Weights.size(); ++i) {
+        if (RegMask[i] && Weights[i] < Weights[MinWeightIndex])
+          MinWeightIndex = i;
+      }
+      if (Cur.range().getWeight() <= Weights[MinWeightIndex]) {
+        // Cur doesn't have priority over any other live ranges, so
+        // don't allocate any register to it, and move it to the
+        // Handled state.
+        Handled.push_back(Cur);
+        if (Cur.range().getWeight().isInf()) {
+          Func->setError("Unable to find a physical register for an "
+                         "infinite-weight live range");
+        }
+      } else {
+        // Evict all live ranges in Active that register number
+        // MinWeightIndex is assigned to.
+        for (UnorderedRanges::iterator I = Active.begin(), E = Active.end();
+             I != E; I = Next) {
+          Next = I;
+          ++Next;
+          LiveRangeWrapper Item = *I;
+          if (Item.Var->getRegNumTmp() == MinWeightIndex) {
+            if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+              Str << "Evicting     ";
+              Item.dump(Func);
+              Str << "\n";
+            }
+            --RegUses[MinWeightIndex];
+            assert(RegUses[MinWeightIndex] >= 0);
+            Item.Var->setRegNumTmp(Variable::NoRegister);
+            Active.erase(I);
+            Handled.push_back(Item);
+          }
+        }
+        // Do the same for Inactive.
+        for (UnorderedRanges::iterator I = Inactive.begin(), E = Inactive.end();
+             I != E; I = Next) {
+          Next = I;
+          ++Next;
+          LiveRangeWrapper Item = *I;
+          if (Item.Var->getRegNumTmp() == MinWeightIndex) {
+            if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+              Str << "Evicting     ";
+              Item.dump(Func);
+              Str << "\n";
+            }
+            Item.Var->setRegNumTmp(Variable::NoRegister);
+            Inactive.erase(I);
+            Handled.push_back(Item);
+          }
+        }
+        // Assign the register to Cur.
+        Cur.Var->setRegNumTmp(MinWeightIndex);
+        assert(RegUses[MinWeightIndex] >= 0);
+        ++RegUses[MinWeightIndex];
+        Active.push_back(Cur);
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Allocating   ";
+          Cur.dump(Func);
+          Str << "\n";
+        }
+      }
+    }
+    dump(Func);
+  }
+  // Move anything Active or Inactive to Handled for easier handling.
+  for (UnorderedRanges::iterator I = Active.begin(), E = Active.end(); I != E;
+       I = Next) {
+    Next = I;
+    ++Next;
+    Handled.push_back(*I);
+    Active.erase(I);
+  }
+  for (UnorderedRanges::iterator I = Inactive.begin(), E = Inactive.end();
+       I != E; I = Next) {
+    Next = I;
+    ++Next;
+    Handled.push_back(*I);
+    Inactive.erase(I);
+  }
+  dump(Func);
+  // Finish up by assigning RegNumTmp->RegNum for each Variable.
+  for (UnorderedRanges::const_iterator I = Handled.begin(), E = Handled.end();
+       I != E; ++I) {
+    LiveRangeWrapper Item = *I;
+    int32_t RegNum = Item.Var->getRegNumTmp();
+    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      if (!Item.Var->hasRegTmp()) {
+        Str << "Not assigning ";
+        Item.Var->dump(Func);
+        Str << "\n";
+      } else {
+        Str << (RegNum == Item.Var->getRegNum() ? "Reassigning " : "Assigning ")
+            << Func->getTarget()->getRegName(RegNum, IceType_i32) << "(r"
+            << RegNum << ") to ";
+        Item.Var->dump(Func);
+        Str << "\n";
+      }
+    }
+    Item.Var->setRegNum(Item.Var->getRegNumTmp());
+  }
+  // TODO: Consider running register allocation one more time, with
+  // infinite registers, for two reasons.  First, evicted live ranges
+  // get a second chance for a register.  Second, it allows coalescing
+  // of stack slots.  If there is no time budget for the second
+  // register allocation run, each unallocated variable just gets its
+  // own slot.
+  //
+  // Another idea for coalescing stack slots is to initialize the
+  // Unhandled list with just the unallocated variables, saving time
+  // but not offering second-chance opportunities.
+}
+// ======================== Dump routines ======================== //
+void LiveRangeWrapper::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  const static size_t BufLen = 30;
+  char buf[BufLen];
+  snprintf(buf, BufLen, "%2d", Var->getRegNumTmp());
+  Str << "R=" << buf << "  V=";
+  Var->dump(Func);
+  Str << "  Range=" << range();
+}
+void LinearScan::dump(Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (!Func->getContext()->isVerbose(IceV_LinearScan))
+    return;
+  Func->setCurrentNode(NULL);
+  Str << "**** Current regalloc state:\n";
+  Str << "++++++ Handled:\n";
+  for (UnorderedRanges::const_iterator I = Handled.begin(), E = Handled.end();
+       I != E; ++I) {
+    I->dump(Func);
+    Str << "\n";
+  }
+  Str << "++++++ Unhandled:\n";
+  for (OrderedRanges::const_iterator I = Unhandled.begin(), E = Unhandled.end();
+       I != E; ++I) {
+    I->dump(Func);
+    Str << "\n";
+  }
+  Str << "++++++ Active:\n";
+  for (UnorderedRanges::const_iterator I = Active.begin(), E = Active.end();
+       I != E; ++I) {
+    I->dump(Func);
+    Str << "\n";
+  }
+  Str << "++++++ Inactive:\n";
+  for (UnorderedRanges::const_iterator I = Inactive.begin(), E = Inactive.end();
+       I != E; ++I) {
+    I->dump(Func);
+    Str << "\n";
+  }
+}
+} // end of namespace Ice
--- a/src/IceRegAlloc.h
+++ b/src/IceRegAlloc.h
+//===- subzero/src/IceRegAlloc.h - Linear-scan reg. allocation --*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the data structures used during linear-scan
+// register allocation.  This includes LiveRangeWrapper which
+// encapsulates a variable and its live range, and LinearScan which
+// holds the various work queues for the linear-scan algorithm.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SUBZERO_SRC_ICEREGALLOC_H
+#define SUBZERO_SRC_ICEREGALLOC_H
+#include "IceDefs.h"
+#include "IceTypes.h"
+namespace Ice {
+// Currently this just wraps a Variable pointer, so in principle we
+// could use containers of Variable* instead of LiveRangeWrapper.  But
+// in the future, we may want to do more complex things such as live
+// range splitting, and keeping a wrapper should make that simpler.
+class LiveRangeWrapper {
+public:
+  LiveRangeWrapper(Variable *Var) : Var(Var) {}
+  const LiveRange &range() const { return Var->getLiveRange(); }
+  bool endsBefore(const LiveRangeWrapper &Other) const {
+    return range().endsBefore(Other.range());
+  }
+  bool overlaps(const LiveRangeWrapper &Other) const {
+    return range().overlaps(Other.range());
+  }
+  bool overlapsStart(const LiveRangeWrapper &Other) const {
+    return range().overlaps(Other.range().getStart());
+  }
+  Variable *const Var;
+  void dump(const Cfg *Func) const;
+private:
+  // LiveRangeWrapper(const LiveRangeWrapper &) LLVM_DELETED_FUNCTION;
+  LiveRangeWrapper &operator=(const LiveRangeWrapper &) LLVM_DELETED_FUNCTION;
+};
+class LinearScan {
+public:
+  LinearScan(Cfg *Func) : Func(Func) {}
+  void scan(const llvm::SmallBitVector &RegMask);
+  void dump(Cfg *Func) const;
+private:
+  Cfg *const Func;
+  // RangeCompare is the comparator for sorting an LiveRangeWrapper
+  // by starting point in a std::set<>.  Ties are broken by variable
+  // number so that sorting is stable.
+  struct RangeCompare {
+    bool operator()(const LiveRangeWrapper &L,
+                    const LiveRangeWrapper &R) const {
+      InstNumberT Lstart = L.Var->getLiveRange().getStart();
+      InstNumberT Rstart = R.Var->getLiveRange().getStart();
+      if (Lstart == Rstart)
+        return L.Var->getIndex() < R.Var->getIndex();
+      return Lstart < Rstart;
+    }
+  };
+  typedef std::set<LiveRangeWrapper, RangeCompare> OrderedRanges;
+  typedef std::list<LiveRangeWrapper> UnorderedRanges;
+  OrderedRanges Unhandled;
+  UnorderedRanges Active, Inactive, Handled;
+  LinearScan(const LinearScan &) LLVM_DELETED_FUNCTION;
+  LinearScan &operator=(const LinearScan &) LLVM_DELETED_FUNCTION;
+};
+} // end of namespace Ice
+#endif // SUBZERO_SRC_ICEREGALLOC_H
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -18,6 +18,7 @@
 #include "IceCfg.h" // setError()
 #include "IceCfgNode.h"
 #include "IceOperand.h"
+#include "IceRegAlloc.h"
 #include "IceTargetLowering.h"
 #include "IceTargetLoweringX8632.h"
@@ -66,6 +67,15 @@ TargetLowering *TargetLowering::createLowering(TargetArch Target, Cfg *Func) {
  return NULL;
 }
+void TargetLowering::doAddressOpt() {
+  if (llvm::isa<InstLoad>(*Context.getCur()))
+    doAddressOptLoad();
+  else if (llvm::isa<InstStore>(*Context.getCur()))
+    doAddressOptStore();
+  Context.advanceCur();
+  Context.advanceNext();
+}
 // Lowers a single instruction according to the information in
 // Context, by checking the Context.Cur instruction kind and calling
 // the appropriate lowering method.  The lowering method should insert
@@ -144,4 +154,21 @@ void TargetLowering::lower() {
  Context.advanceNext();
 }
+// Drives register allocation, allowing all physical registers (except
+// perhaps for the frame pointer) to be allocated.  This set of
+// registers could potentially be parameterized if we want to restrict
+// registers e.g. for performance testing.
+void TargetLowering::regAlloc() {
+  LinearScan LinearScan(Func);
+  RegSetMask RegInclude = RegSet_None;
+  RegSetMask RegExclude = RegSet_None;
+  RegInclude |= RegSet_CallerSave;
+  RegInclude |= RegSet_CalleeSave;
+  RegExclude |= RegSet_StackPointer;
+  if (hasFramePointer())
+    RegExclude |= RegSet_FramePointer;
+  llvm::SmallBitVector RegMask = getRegisterSet(RegInclude, RegExclude);
+  LinearScan.scan(RegMask);
+}
 } // end of namespace Ice
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -109,6 +109,8 @@ public:
    Func->setError("Target doesn't specify O2 lowering steps.");
  }
+  // Tries to do address mode optimization on a single instruction.
+  void doAddressOpt();
  // Lowers a single instruction.
  void lower();
@@ -173,6 +175,8 @@ protected:
  virtual void lowerSwitch(const InstSwitch *Inst) = 0;
  virtual void lowerUnreachable(const InstUnreachable *Inst) = 0;
+  virtual void doAddressOptLoad() {}
+  virtual void doAddressOptStore() {}
  // This gives the target an opportunity to post-process the lowered
  // expansion before returning.  The primary intention is to do some
  // Register Manager activity as necessary, specifically to eagerly

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -210,9 +210,97 @@ TargetX8632::TargetX8632(Cfg *Func)
  TypeToRegisterSet[IceType_f64] = FloatRegisters;
 }
+void TargetX8632::translateO2() {
+  GlobalContext *Context = Func->getContext();
+  // Lower Phi instructions.
+  Timer T_placePhiLoads;
+  Func->placePhiLoads();
+  if (Func->hasError())
+    return;
+  T_placePhiLoads.printElapsedUs(Context, "placePhiLoads()");
+  Timer T_placePhiStores;
+  Func->placePhiStores();
+  if (Func->hasError())
+    return;
+  T_placePhiStores.printElapsedUs(Context, "placePhiStores()");
+  Timer T_deletePhis;
+  Func->deletePhis();
+  if (Func->hasError())
+    return;
+  T_deletePhis.printElapsedUs(Context, "deletePhis()");
+  Func->dump("After Phi lowering");
+  // Address mode optimization.
+  Timer T_doAddressOpt;
+  Func->doAddressOpt();
+  T_doAddressOpt.printElapsedUs(Context, "doAddressOpt()");
+  // Target lowering.  This requires liveness analysis for some parts
+  // of the lowering decisions, such as compare/branch fusing.  If
+  // non-lightweight liveness analysis is used, the instructions need
+  // to be renumbered first.  TODO: This renumbering should only be
+  // necessary if we're actually calculating live intervals, which we
+  // only do for register allocation.
+  Timer T_renumber1;
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+  T_renumber1.printElapsedUs(Context, "renumberInstructions()");
+  // TODO: It should be sufficient to use the fastest liveness
+  // calculation, i.e. livenessLightweight().  However, for some
+  // reason that slows down the rest of the translation.  Investigate.
+  Timer T_liveness1;
+  Func->liveness(Liveness_Basic);
+  if (Func->hasError())
+    return;
+  T_liveness1.printElapsedUs(Context, "liveness()");
+  Func->dump("After x86 address mode opt");
+  Timer T_genCode;
+  Func->genCode();
+  if (Func->hasError())
+    return;
+  T_genCode.printElapsedUs(Context, "genCode()");
+  // Register allocation.  This requires instruction renumbering and
+  // full liveness analysis.
+  Timer T_renumber2;
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+  T_renumber2.printElapsedUs(Context, "renumberInstructions()");
+  Timer T_liveness2;
+  Func->liveness(Liveness_Intervals);
+  if (Func->hasError())
+    return;
+  T_liveness2.printElapsedUs(Context, "liveness()");
+  // Validate the live range computations.  Do it outside the timing
+  // code.  TODO: Put this under a flag.
+  bool ValidLiveness = Func->validateLiveness();
+  assert(ValidLiveness);
+  (void)ValidLiveness; // used only in assert()
+  ComputedLiveRanges = true;
+  // The post-codegen dump is done here, after liveness analysis and
+  // associated cleanup, to make the dump cleaner and more useful.
+  Func->dump("After initial x8632 codegen");
+  Timer T_regAlloc;
+  regAlloc();
+  if (Func->hasError())
+    return;
+  T_regAlloc.printElapsedUs(Context, "regAlloc()");
+  Func->dump("After linear scan regalloc");
+  // Stack frame mapping.
+  Timer T_genFrame;
+  Func->genFrame();
+  if (Func->hasError())
+    return;
+  T_genFrame.printElapsedUs(Context, "genFrame()");
+  Func->dump("After stack frame mapping");
+}
 void TargetX8632::translateOm1() {
  GlobalContext *Context = Func->getContext();
-  Ostream &Str = Context->getStrDump();
  Timer T_placePhiLoads;
  Func->placePhiLoads();
  if (Func->hasError())
@@ -228,30 +316,21 @@ void TargetX8632::translateOm1() {
  if (Func->hasError())
    return;
  T_deletePhis.printElapsedUs(Context, "deletePhis()");
-  if (Context->isVerbose()) {
+  Func->dump("After Phi lowering");
-    Str << "================ After Phi lowering ================\n";
-    Func->dump();
-  }
  Timer T_genCode;
  Func->genCode();
  if (Func->hasError())
    return;
  T_genCode.printElapsedUs(Context, "genCode()");
-  if (Context->isVerbose()) {
+  Func->dump("After initial x8632 codegen");
-    Str << "================ After initial x8632 codegen ================\n";
-    Func->dump();
-  }
  Timer T_genFrame;
  Func->genFrame();
  if (Func->hasError())
    return;
  T_genFrame.printElapsedUs(Context, "genFrame()");
-  if (Context->isVerbose()) {
+  Func->dump("After stack frame mapping");
-    Str << "================ After stack frame mapping ================\n";
-    Func->dump();
-  }
 }
 IceString TargetX8632::RegNames[] = {
@@ -327,8 +406,8 @@ void TargetX8632::emitVariable(const Variable *Var, const Cfg *Func) const {
 // calls itself recursively on the components, taking care to handle
 // Lo first because of the little-endian architecture.
 void TargetX8632::setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr,
-                                      int32_t BasicFrameOffset,
+                                      size_t BasicFrameOffset,
-                                      int32_t &InArgsSizeBytes) {
+                                      size_t &InArgsSizeBytes) {
  Variable *Lo = Arg->getLo();
  Variable *Hi = Arg->getHi();
  Type Ty = Arg->getType();
@@ -359,9 +438,9 @@ void TargetX8632::addProlog(CfgNode *Node) {
  // block 1 and C is local to block 2, then C may share a slot with A
  // or B.
  const bool SimpleCoalescing = true;
-  int32_t InArgsSizeBytes = 0;
+  size_t InArgsSizeBytes = 0;
-  int32_t RetIpSizeBytes = 4;
+  size_t RetIpSizeBytes = 4;
-  int32_t PreservedRegsSizeBytes = 0;
+  size_t PreservedRegsSizeBytes = 0;
  LocalsSizeBytes = 0;
  Context.init(Node);
  Context.setInsertPoint(Context.getCur());
@@ -380,8 +459,8 @@ void TargetX8632::addProlog(CfgNode *Node) {
  llvm::SmallBitVector CalleeSaves =
      getRegisterSet(RegSet_CalleeSave, RegSet_None);
-  int32_t GlobalsSize = 0;
+  size_t GlobalsSize = 0;
-  std::vector<int> LocalsSize(Func->getNumNodes());
+  std::vector<size_t> LocalsSize(Func->getNumNodes());
  // Prepass.  Compute RegsUsed, PreservedRegsSizeBytes, and
  // LocalsSizeBytes.
@@ -398,6 +477,9 @@ void TargetX8632::addProlog(CfgNode *Node) {
    // An argument passed on the stack already has a stack slot.
    if (Var->getIsArg())
      continue;
+    // An unreferenced variable doesn't need a stack slot.
+    if (ComputedLiveRanges && Var->getLiveRange().isEmpty())
+      continue;
    // A spill slot linked to a variable with a stack slot should reuse
    // that stack slot.
    if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) {
@@ -406,7 +488,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
          continue;
      }
    }
-    int32_t Increment = typeWidthInBytesOnStack(Var->getType());
+    size_t Increment = typeWidthInBytesOnStack(Var->getType());
    if (SimpleCoalescing) {
      if (Var->isMultiblockLife()) {
        GlobalsSize += Increment;
@@ -461,7 +543,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
  // and if they have no home register, home space will need to be
  // allocated on the stack to copy into.
  Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
-  int32_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes;
+  size_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes;
  if (!IsEbpBasedFrame)
    BasicFrameOffset += LocalsSizeBytes;
  for (SizeT i = 0; i < Args.size(); ++i) {
@@ -470,10 +552,10 @@ void TargetX8632::addProlog(CfgNode *Node) {
  }
  // Fill in stack offsets for locals.
-  int32_t TotalGlobalsSize = GlobalsSize;
+  size_t TotalGlobalsSize = GlobalsSize;
  GlobalsSize = 0;
  LocalsSize.assign(LocalsSize.size(), 0);
-  int32_t NextStackOffset = 0;
+  size_t NextStackOffset = 0;
  for (VarList::const_iterator I = Variables.begin(), E = Variables.end();
       I != E; ++I) {
    Variable *Var = *I;
@@ -483,6 +565,8 @@ void TargetX8632::addProlog(CfgNode *Node) {
    }
    if (Var->getIsArg())
      continue;
+    if (ComputedLiveRanges && Var->getLiveRange().isEmpty())
+      continue;
    if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) {
      if (Variable *Linked = Var->getPreferredRegister()) {
        if (!Linked->hasReg()) {
@@ -493,7 +577,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
        }
      }
    }
-    int32_t Increment = typeWidthInBytesOnStack(Var->getType());
+    size_t Increment = typeWidthInBytesOnStack(Var->getType());
    if (SimpleCoalescing) {
      if (Var->isMultiblockLife()) {
        GlobalsSize += Increment;
@@ -1601,6 +1685,37 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
  Operand *Src1 = legalize(Inst->getSrc(1));
  Variable *Dest = Inst->getDest();
+  // If Src1 is an immediate, or known to be a physical register, we can
+  // allow Src0 to be a memory operand.  Otherwise, Src0 must be copied into
+  // a physical register.  (Actually, either Src0 or Src1 can be chosen for
+  // the physical register, but unfortunately we have to commit to one or
+  // the other before register allocation.)
+  bool IsSrc1ImmOrReg = false;
+  if (llvm::isa<Constant>(Src1)) {
+    IsSrc1ImmOrReg = true;
+  } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
+    if (Var->hasReg())
+      IsSrc1ImmOrReg = true;
+  }
+  // Try to fuse a compare immediately followed by a conditional branch.  This
+  // is possible when the compare dest and the branch source operands are the
+  // same, and are their only uses.  TODO: implement this optimization for i64.
+  if (InstBr *NextBr = llvm::dyn_cast_or_null<InstBr>(Context.getNextInst())) {
+    if (Src0->getType() != IceType_i64 && !NextBr->isUnconditional() &&
+        Dest == NextBr->getSrc(0) && NextBr->isLastUse(Dest)) {
+      Operand *Src0New =
+          legalize(Src0, IsSrc1ImmOrReg ? Legal_All : Legal_Reg, true);
+      _cmp(Src0New, Src1);
+      _br(getIcmp32Mapping(Inst->getCondition()), NextBr->getTargetTrue(),
+          NextBr->getTargetFalse());
+      // Skip over the following branch instruction.
+      NextBr->setDeleted();
+      Context.advanceNext();
+      return;
+    }
+  }
  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
  Constant *Zero = Ctx->getConstantInt(IceType_i32, 0);
  Constant *One = Ctx->getConstantInt(IceType_i32, 1);
@@ -1637,19 +1752,6 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
    return;
  }
-  // If Src1 is an immediate, or known to be a physical register, we can
-  // allow Src0 to be a memory operand.  Otherwise, Src0 must be copied into
-  // a physical register.  (Actually, either Src0 or Src1 can be chosen for
-  // the physical register, but unfortunately we have to commit to one or
-  // the other before register allocation.)
-  bool IsSrc1ImmOrReg = false;
-  if (llvm::isa<Constant>(Src1)) {
-    IsSrc1ImmOrReg = true;
-  } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
-    if (Var->hasReg())
-      IsSrc1ImmOrReg = true;
-  }
  // cmp b, c
  Operand *Src0New =
      legalize(Src0, IsSrc1ImmOrReg ? Legal_All : Legal_Reg, true);
@@ -1662,6 +1764,135 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
  Context.insert(Label);
 }
+namespace {
+bool isAdd(const Inst *Inst) {
+  if (const InstArithmetic *Arith =
+          llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {
+    return (Arith->getOp() == InstArithmetic::Add);
+  }
+  return false;
+}
+void computeAddressOpt(Variable *&Base, Variable *&Index, int32_t &Shift,
+                       int32_t &Offset) {
+  (void)Offset; // TODO: pattern-match for non-zero offsets.
+  if (Base == NULL)
+    return;
+  // If the Base has more than one use or is live across multiple
+  // blocks, then don't go further.  Alternatively (?), never consider
+  // a transformation that would change a variable that is currently
+  // *not* live across basic block boundaries into one that *is*.
+  if (Base->isMultiblockLife() /* || Base->getUseCount() > 1*/)
+    return;
+  while (true) {
+    // Base is Base=Var ==>
+    //   set Base=Var
+    const Inst *BaseInst = Base->getDefinition();
+    Operand *BaseOperand0 = BaseInst ? BaseInst->getSrc(0) : NULL;
+    Variable *BaseVariable0 = llvm::dyn_cast_or_null<Variable>(BaseOperand0);
+    // TODO: Helper function for all instances of assignment
+    // transitivity.
+    if (BaseInst && llvm::isa<InstAssign>(BaseInst) && BaseVariable0 &&
+        // TODO: ensure BaseVariable0 stays single-BB
+        true) {
+      Base = BaseVariable0;
+      continue;
+    }
+    // Index is Index=Var ==>
+    //   set Index=Var
+    // Index==NULL && Base is Base=Var1+Var2 ==>
+    //   set Base=Var1, Index=Var2, Shift=0
+    Operand *BaseOperand1 =
+        BaseInst && BaseInst->getSrcSize() >= 2 ? BaseInst->getSrc(1) : NULL;
+    Variable *BaseVariable1 = llvm::dyn_cast_or_null<Variable>(BaseOperand1);
+    if (Index == NULL && isAdd(BaseInst) && BaseVariable0 && BaseVariable1 &&
+        // TODO: ensure BaseVariable0 and BaseVariable1 stay single-BB
+        true) {
+      Base = BaseVariable0;
+      Index = BaseVariable1;
+      Shift = 0; // should already have been 0
+      continue;
+    }
+    // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
+    //   Index=Var, Shift+=log2(Const)
+    const Inst *IndexInst = Index ? Index->getDefinition() : NULL;
+    if (const InstArithmetic *ArithInst =
+            llvm::dyn_cast_or_null<InstArithmetic>(IndexInst)) {
+      Operand *IndexOperand0 = ArithInst->getSrc(0);
+      Variable *IndexVariable0 = llvm::dyn_cast<Variable>(IndexOperand0);
+      Operand *IndexOperand1 = ArithInst->getSrc(1);
+      ConstantInteger *IndexConstant1 =
+          llvm::dyn_cast<ConstantInteger>(IndexOperand1);
+      if (ArithInst->getOp() == InstArithmetic::Mul && IndexVariable0 &&
+          IndexOperand1->getType() == IceType_i32 && IndexConstant1) {
+        uint64_t Mult = IndexConstant1->getValue();
+        uint32_t LogMult;
+        switch (Mult) {
+        case 1:
+          LogMult = 0;
+          break;
+        case 2:
+          LogMult = 1;
+          break;
+        case 4:
+          LogMult = 2;
+          break;
+        case 8:
+          LogMult = 3;
+          break;
+        default:
+          LogMult = 4;
+          break;
+        }
+        if (Shift + LogMult <= 3) {
+          Index = IndexVariable0;
+          Shift += LogMult;
+          continue;
+        }
+      }
+    }
+    // Index is Index=Var<<Const && Const+Shift<=3 ==>
+    //   Index=Var, Shift+=Const
+    // Index is Index=Const*Var && log2(Const)+Shift<=3 ==>
+    //   Index=Var, Shift+=log2(Const)
+    // Index && Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
+    //   swap(Index,Base)
+    // Similar for Base=Const*Var and Base=Var<<Const
+    // Base is Base=Var+Const ==>
+    //   set Base=Var, Offset+=Const
+    // Base is Base=Const+Var ==>
+    //   set Base=Var, Offset+=Const
+    // Base is Base=Var-Const ==>
+    //   set Base=Var, Offset-=Const
+    // Index is Index=Var+Const ==>
+    //   set Index=Var, Offset+=(Const<<Shift)
+    // Index is Index=Const+Var ==>
+    //   set Index=Var, Offset+=(Const<<Shift)
+    // Index is Index=Var-Const ==>
+    //   set Index=Var, Offset-=(Const<<Shift)
+    // TODO: consider overflow issues with respect to Offset.
+    // TODO: handle symbolic constants.
+    break;
+  }
+}
+} // anonymous namespace
 void TargetX8632::lowerLoad(const InstLoad *Inst) {
  // A Load instruction can be treated the same as an Assign
  // instruction, after the source operand is transformed into an
@@ -1679,10 +1910,64 @@ void TargetX8632::lowerLoad(const InstLoad *Inst) {
    Src0 = OperandX8632Mem::create(Func, Ty, Base, Offset);
  }
+  // Fuse this load with a subsequent Arithmetic instruction in the
+  // following situations:
+  //   a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
+  //   a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
+  //
+  // TODO: Clean up and test thoroughly.
+  //
+  // TODO: Why limit to Arithmetic instructions?  This could probably be
+  // applied to most any instruction type.  Look at all source operands
+  // in the following instruction, and if there is one instance of the
+  // load instruction's dest variable, and that instruction ends that
+  // variable's live range, then make the substitution.  Deal with
+  // commutativity optimization in the arithmetic instruction lowering.
+  InstArithmetic *NewArith = NULL;
+  if (InstArithmetic *Arith =
+          llvm::dyn_cast_or_null<InstArithmetic>(Context.getNextInst())) {
+    Variable *DestLoad = Inst->getDest();
+    Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
+    Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
+    if (Src1Arith == DestLoad && Arith->isLastUse(Src1Arith) &&
+        DestLoad != Src0Arith) {
+      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
+                                        Arith->getSrc(0), Src0);
+    } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
+               Arith->isLastUse(Src0Arith) && DestLoad != Src1Arith) {
+      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
+                                        Arith->getSrc(1), Src0);
+    }
+    if (NewArith) {
+      Arith->setDeleted();
+      Context.advanceNext();
+      lowerArithmetic(NewArith);
+      return;
+    }
+  }
  InstAssign *Assign = InstAssign::create(Func, Inst->getDest(), Src0);
  lowerAssign(Assign);
 }
+void TargetX8632::doAddressOptLoad() {
+  Inst *Inst = *Context.getCur();
+  Variable *Dest = Inst->getDest();
+  Operand *Addr = Inst->getSrc(0);
+  Variable *Index = NULL;
+  int32_t Shift = 0;
+  int32_t Offset = 0; // TODO: make Constant
+  Variable *Base = llvm::dyn_cast<Variable>(Addr);
+  computeAddressOpt(Base, Index, Shift, Offset);
+  if (Base && Addr != Base) {
+    Constant *OffsetOp = Ctx->getConstantInt(IceType_i32, Offset);
+    Addr = OperandX8632Mem::create(Func, Dest->getType(), Base, OffsetOp, Index,
+                                   Shift);
+    Inst->setDeleted();
+    Context.insert(InstLoad::create(Func, Dest, Addr));
+  }
+}
 void TargetX8632::lowerPhi(const InstPhi * /*Inst*/) {
  Func->setError("Phi found in regular instruction list");
 }
@@ -1781,6 +2066,24 @@ void TargetX8632::lowerStore(const InstStore *Inst) {
  }
 }
+void TargetX8632::doAddressOptStore() {
+  InstStore *Inst = llvm::cast<InstStore>(*Context.getCur());
+  Operand *Data = Inst->getData();
+  Operand *Addr = Inst->getAddr();
+  Variable *Index = NULL;
+  int32_t Shift = 0;
+  int32_t Offset = 0; // TODO: make Constant
+  Variable *Base = llvm::dyn_cast<Variable>(Addr);
+  computeAddressOpt(Base, Index, Shift, Offset);
+  if (Base && Addr != Base) {
+    Constant *OffsetOp = Ctx->getConstantInt(IceType_i32, Offset);
+    Addr = OperandX8632Mem::create(Func, Data->getType(), Base, OffsetOp, Index,
+                                   Shift);
+    Inst->setDeleted();
+    Context.insert(InstStore::create(Func, Data, Addr));
+  }
+}
 void TargetX8632::lowerSwitch(const InstSwitch *Inst) {
  // This implements the most naive possible lowering.
  // cmp a,val[0]; jeq label[0]; cmp a,val[1]; jeq label[1]; ... jmp default
@@ -1904,11 +2207,10 @@ void TargetX8632::postLower() {
      continue;
    if (llvm::isa<InstFakeKill>(Inst))
      continue;
-    SizeT VarIndex = 0;
    for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
      Operand *Src = Inst->getSrc(SrcNum);
      SizeT NumVars = Src->getNumVars();
-      for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
+      for (SizeT J = 0; J < NumVars; ++J) {
        const Variable *Var = Src->getVar(J);
        if (!Var->hasReg())
          continue;
@@ -1923,11 +2225,10 @@ void TargetX8632::postLower() {
    const Inst *Inst = *I;
    if (Inst->isDeleted())
      continue;
-    SizeT VarIndex = 0;
    for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
      Operand *Src = Inst->getSrc(SrcNum);
      SizeT NumVars = Src->getNumVars();
-      for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
+      for (SizeT J = 0; J < NumVars; ++J) {
        Variable *Var = Src->getVar(J);
        if (Var->hasReg())
          continue;
@@ -1952,15 +2253,15 @@ void TargetX8632::postLower() {
  }
 }
-template <> void ConstantFloat::emit(const Cfg *Func) const {
+template <> void ConstantFloat::emit(GlobalContext *Ctx) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
+  Ostream &Str = Ctx->getStrEmit();
  // It would be better to prefix with ".L$" instead of "L$", but
  // llvm-mc doesn't parse "dword ptr [.L$foo]".
  Str << "dword ptr [L$" << IceType_f32 << "$" << getPoolEntryID() << "]";
 }
-template <> void ConstantDouble::emit(const Cfg *Func) const {
+template <> void ConstantDouble::emit(GlobalContext *Ctx) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
+  Ostream &Str = Ctx->getStrEmit();
  Str << "qword ptr [L$" << IceType_f64 << "$" << getPoolEntryID() << "]";
 }

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -27,6 +27,7 @@ public:
  static TargetX8632 *create(Cfg *Func) { return new TargetX8632(Func); }
  virtual void translateOm1();
+  virtual void translateO2();
  virtual Variable *getPhysicalRegister(SizeT RegNum);
  virtual IceString getRegName(SizeT RegNum, Type Ty) const;
@@ -56,7 +57,7 @@ public:
  // latter could be done by directly writing to the stack).
  void split64(Variable *Var);
  void setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr,
-                           int32_t BasicFrameOffset, int32_t &InArgsSizeBytes);
+                           size_t BasicFrameOffset, size_t &InArgsSizeBytes);
  Operand *loOperand(Operand *Operand);
  Operand *hiOperand(Operand *Operand);
@@ -89,6 +90,8 @@ protected:
  virtual void lowerStore(const InstStore *Inst);
  virtual void lowerSwitch(const InstSwitch *Inst);
  virtual void lowerUnreachable(const InstUnreachable *Inst);
+  virtual void doAddressOptLoad();
+  virtual void doAddressOptStore();
  // Operand legalization helpers.  To deal with address mode
  // constraints, the helpers will create a new Operand and emit
@@ -248,8 +251,8 @@ protected:
  }
  bool IsEbpBasedFrame;
-  int32_t FrameSizeLocals;
+  size_t FrameSizeLocals;
-  int32_t LocalsSizeBytes;
+  size_t LocalsSizeBytes;
  llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
  llvm::SmallBitVector ScratchRegs;
  llvm::SmallBitVector RegsUsed;
@@ -265,8 +268,8 @@ private:
  template <typename T> void emitConstantPool() const;
 };
-template <> void ConstantFloat::emit(const Cfg *Func) const;
+template <> void ConstantFloat::emit(GlobalContext *Ctx) const;
-template <> void ConstantDouble::emit(const Cfg *Func) const;
+template <> void ConstantDouble::emit(GlobalContext *Ctx) const;
 } // end of namespace Ice

--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -100,6 +100,29 @@ public:
    return Func;
  }
+  // convertConstant() does not use Func or require it to be a valid
+  // Ice::Cfg pointer.  As such, it's suitable for e.g. constructing
+  // global initializers.
+  Ice::Constant *convertConstant(const Constant *Const) {
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(Const)) {
+      return Ctx->getConstantSym(convertType(GV->getType()), 0, GV->getName());
+    } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(Const)) {
+      return Ctx->getConstantInt(convertIntegerType(CI->getType()),
+                                 CI->getZExtValue());
+    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Const)) {
+      Ice::Type Type = convertType(CFP->getType());
+      if (Type == Ice::IceType_f32)
+        return Ctx->getConstantFloat(CFP->getValueAPF().convertToFloat());
+      else if (Type == Ice::IceType_f64)
+        return Ctx->getConstantDouble(CFP->getValueAPF().convertToDouble());
+      llvm_unreachable("Unexpected floating point type");
+      return NULL;
+    } else {
+      llvm_unreachable("Unhandled constant type");
+      return NULL;
+    }
+  }
 private:
  // LLVM values (instructions, etc.) are mapped directly to ICE variables.
  // mapValueToIceVar has a version that forces an ICE type on the variable,
@@ -180,24 +203,7 @@ private:
  Ice::Operand *convertValue(const Value *Op) {
    if (const Constant *Const = dyn_cast<Constant>(Op)) {
-      if (const GlobalValue *GV = dyn_cast<GlobalValue>(Const)) {
+      return convertConstant(Const);
-        return Ctx->getConstantSym(convertType(GV->getType()), 0,
-                                   GV->getName());
-      } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(Const)) {
-        return Ctx->getConstantInt(convertIntegerType(CI->getType()),
-                                   CI->getZExtValue());
-      } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Const)) {
-        Ice::Type Type = convertType(CFP->getType());
-        if (Type == Ice::IceType_f32)
-          return Ctx->getConstantFloat(CFP->getValueAPF().convertToFloat());
-        else if (Type == Ice::IceType_f64)
-          return Ctx->getConstantDouble(CFP->getValueAPF().convertToDouble());
-        llvm_unreachable("Unexpected floating point type");
-        return NULL;
-      } else {
-        llvm_unreachable("Unhandled constant type");
-        return NULL;
-      }
    } else {
      return mapValueToIceVar(Op);
    }

--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -2,7 +2,7 @@
 ; particular the patterns for lowering i64 operations into constituent
 ; i32 operations on x86-32.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
 ; This is a basic test of the alloca instruction - one test for alloca
 ; of a fixed size, and one test for variable size.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/bitcast.ll
+++ b/tests_lit/llvm2ice_tests/bitcast.ll
 ; Trivial smoke test of bitcast between integer and FP types.
-; RUN: %llvm2ice --verbose inst %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
@@ -9,30 +9,38 @@
 define internal i32 @cast_f2i(float %f) {
 entry:
  %v0 = bitcast float %f to i32
-; CHECK: bitcast
  ret i32 %v0
 }
+; CHECK: mov eax
+; CHECK: ret
 define internal float @cast_i2f(i32 %i) {
 entry:
  %v0 = bitcast i32 %i to float
-; CHECK: bitcast
  ret float %v0
 }
+; CHECK: fld dword ptr
+; CHECK: ret
 define internal i64 @cast_d2ll(double %d) {
 entry:
  %v0 = bitcast double %d to i64
-; CHECK: bitcast
  ret i64 %v0
 }
+; CHECK: mov edx
+; CHECK: ret
 define internal double @cast_ll2d(i64 %ll) {
 entry:
  %v0 = bitcast i64 %ll to double
-; CHECK: bitcast
  ret double %v0
 }
+; CHECK: fld qword ptr
+; CHECK: ret
 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ
--- a/tests_lit/llvm2ice_tests/callindirect.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/callindirect.pnacl.ll
@@ -2,7 +2,7 @@
 ; should be to the same operand, whether it's in a register or on the
 ; stack.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/casts.ll
+++ b/tests_lit/llvm2ice_tests/casts.ll
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice --verbose inst %s | FileCheck %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \

--- a/tests_lit/llvm2ice_tests/cmp-opt.ll
+++ b/tests_lit/llvm2ice_tests/cmp-opt.ll
 ; Simple test of non-fused compare/branch.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/convert.ll
+++ b/tests_lit/llvm2ice_tests/convert.ll
 ; Simple test of signed and unsigned integer conversions.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
-; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
@@ -32,25 +32,14 @@ entry:
  ret void
 }
 ; CHECK: from_int8:
-; CHECK: mov al, byte ptr [
+; CHECK: mov {{.*}}, byte ptr [
-; CHECK-NEXT: movsx cx, al
+; CHECK: movsx
-; CHECK-NEXT: mov word ptr [
+; CHECK: mov word ptr [
-; CHECK-NEXT: movsx ecx, al
+; CHECK: movsx
-; CHECK-NEXT: mov dword ptr [
+; CHECK: mov dword ptr [
-; CHECK-NEXT: movsx ecx, al
+; CHECK: movsx
-; CHECK-NEXT: sar eax, 31
+; CHECK: sar {{.*}}, 31
-; CHECK-NEXT: mov dword ptr [i64v+4],
+; CHECK: i64v
-; CHECK-NEXT: mov dword ptr [i64v],
-;
-; OPTM1: from_int8:
-; OPTM1: mov {{.*}}, byte ptr [
-; OPTM1: movsx
-; OPTM1: mov word ptr [
-; OPTM1: movsx
-; OPTM1: mov dword ptr [
-; OPTM1: movsx
-; OPTM1: sar {{.*}}, 31
-; OPTM1: i64v
 define void @from_int16() {
 entry:
@@ -68,24 +57,13 @@ entry:
  ret void
 }
 ; CHECK: from_int16:
-; CHECK: mov ax, word ptr [
+; CHECK: mov {{.*}}, word ptr [
-; CHECK-NEXT: mov cx, ax
+; CHECK: i8v
-; CHECK-NEXT: mov byte ptr [
+; CHECK: movsx
-; CHECK-NEXT: movsx ecx, ax
+; CHECK: i32v
-; CHECK-NEXT: mov dword ptr [
+; CHECK: movsx
-; CHECK-NEXT: movsx ecx, ax
+; CHECK: sar {{.*}}, 31
-; CHECK-NEXT: sar eax, 31
+; CHECK: i64v
-; CHECK-NEXT: mov dword ptr [i64v+4],
-; CHECK-NEXT: mov dword ptr [i64v],
-;
-; OPTM1: from_int16:
-; OPTM1: mov {{.*}}, word ptr [
-; OPTM1: i8v
-; OPTM1: movsx
-; OPTM1: i32v
-; OPTM1: movsx
-; OPTM1: sar {{.*}}, 31
-; OPTM1: i64v
 define void @from_int32() {
 entry:
@@ -103,22 +81,11 @@ entry:
  ret void
 }
 ; CHECK: from_int32:
-; CHECK: mov eax, dword ptr [
+; CHECK: i32v
-; CHECK-NEXT: mov ecx, eax
+; CHECK: i8v
-; CHECK-NEXT: mov byte ptr [
+; CHECK: i16v
-; CHECK-NEXT: mov ecx, eax
+; CHECK: sar {{.*}}, 31
-; CHECK-NEXT: mov word ptr [
+; CHECK: i64v
-; CHECK-NEXT: mov ecx, eax
-; CHECK-NEXT: sar eax, 31
-; CHECK-NEXT: mov dword ptr [i64v+4],
-; CHECK-NEXT: mov dword ptr [i64v],
-;
-; OPTM1: from_int32:
-; OPTM1: i32v
-; OPTM1: i8v
-; OPTM1: i16v
-; OPTM1: sar {{.*}}, 31
-; OPTM1: i64v
 define void @from_int64() {
 entry:
@@ -136,18 +103,10 @@ entry:
  ret void
 }
 ; CHECK: from_int64:
-; CHECK: mov eax, dword ptr [
+; CHECK: i64v
-; CHECK-NEXT: mov ecx, eax
+; CHECK: i8v
-; CHECK-NEXT: mov byte ptr [
+; CHECK: i16v
-; CHECK-NEXT: mov ecx, eax
+; CHECK: i32v
-; CHECK-NEXT: mov word ptr [
-; CHECK-NEXT: mov dword ptr [
-;
-; OPTM1: from_int64:
-; OPTM1: i64v
-; OPTM1: i8v
-; OPTM1: i16v
-; OPTM1: i32v
 define void @from_uint8() {
 entry:
@@ -165,25 +124,14 @@ entry:
  ret void
 }
 ; CHECK: from_uint8:
-; CHECK: mov al, byte ptr [
+; CHECK: u8v
-; CHECK-NEXT: movzx cx, al
+; CHECK: movzx
-; CHECK-NEXT: mov word ptr [
+; CHECK: i16v
-; CHECK-NEXT: movzx ecx, al
+; CHECK: movzx
-; CHECK-NEXT: mov dword ptr [
+; CHECK: i32v
-; CHECK-NEXT: movzx eax, al
+; CHECK: movzx
-; CHECK-NEXT: mov ecx, 0
+; CHECK: mov {{.*}}, 0
-; CHECK-NEXT: mov dword ptr [i64v+4],
+; CHECK: i64v
-; CHECK-NEXT: mov dword ptr [i64v],
-;
-; OPTM1: from_uint8:
-; OPTM1: u8v
-; OPTM1: movzx
-; OPTM1: i16v
-; OPTM1: movzx
-; OPTM1: i32v
-; OPTM1: movzx
-; OPTM1: mov {{.*}}, 0
-; OPTM1: i64v
 define void @from_uint16() {
 entry:
@@ -201,24 +149,13 @@ entry:
  ret void
 }
 ; CHECK: from_uint16:
-; CHECK: mov ax, word ptr [
+; CHECK: u16v
-; CHECK-NEXT: mov cx, ax
+; CHECK: i8v
-; CHECK-NEXT: mov byte ptr [
+; CHECK: movzx
-; CHECK-NEXT: movzx ecx, ax
+; CHECK: i32v
-; CHECK-NEXT: mov dword ptr [
+; CHECK: movzx
-; CHECK-NEXT: movzx eax, ax
+; CHECK: mov {{.*}}, 0
-; CHECK-NEXT: mov ecx, 0
+; CHECK: i64v
-; CHECK-NEXT: mov dword ptr [i64v+4],
-; CHECK-NEXT: mov dword ptr [i64v],
-;
-; OPTM1: from_uint16:
-; OPTM1: u16v
-; OPTM1: i8v
-; OPTM1: movzx
-; OPTM1: i32v
-; OPTM1: movzx
-; OPTM1: mov {{.*}}, 0
-; OPTM1: i64v
 define void @from_uint32() {
 entry:
@@ -236,21 +173,11 @@ entry:
  ret void
 }
 ; CHECK: from_uint32:
-; CHECK: mov eax, dword ptr [
+; CHECK: u32v
-; CHECK-NEXT: mov ecx, eax
+; CHECK: i8v
-; CHECK-NEXT: mov byte ptr [
+; CHECK: i16v
-; CHECK-NEXT: mov ecx, eax
+; CHECK: mov {{.*}}, 0
-; CHECK-NEXT: mov word ptr [
+; CHECK: i64v
-; CHECK-NEXT: mov ecx, 0
-; CHECK-NEXT: mov dword ptr [i64v+4],
-; CHECK-NEXT: mov dword ptr [i64v],
-;
-; OPTM1: from_uint32:
-; OPTM1: u32v
-; OPTM1: i8v
-; OPTM1: i16v
-; OPTM1: mov {{.*}}, 0
-; OPTM1: i64v
 define void @from_uint64() {
 entry:
@@ -268,18 +195,10 @@ entry:
  ret void
 }
 ; CHECK: from_uint64:
-; CHECK: mov eax, dword ptr [
+; CHECK: u64v
-; CHECK-NEXT: mov ecx, eax
+; CHECK: i8v
-; CHECK-NEXT: mov byte ptr [
+; CHECK: i16v
-; CHECK-NEXT: mov ecx, eax
+; CHECK: i32v
-; CHECK-NEXT: mov word ptr [
-; CHECK-NEXT: mov dword ptr [
-;
-; OPTM1: from_uint64:
-; OPTM1: u64v
-; OPTM1: i8v
-; OPTM1: i16v
-; OPTM1: i32v
 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ
--- a/tests_lit/llvm2ice_tests/fp.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/fp.pnacl.ll
@@ -3,7 +3,7 @@
 ; that should be present regardless of the optimization level, so
 ; there are no special OPTM1 match lines.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/fpconst.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/fpconst.pnacl.ll
@@ -6,6 +6,7 @@
 ; number in a reasonable number of digits".  See
 ; http://llvm.org/docs/LangRef.html#simple-constants .
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/select-opt.ll
+++ b/tests_lit/llvm2ice_tests/select-opt.ll
@@ -3,7 +3,7 @@
 ; regardless of the optimization level, so there are no special OPTM1
 ; match lines.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/shift.ll
+++ b/tests_lit/llvm2ice_tests/shift.ll
 ; This is a test of C-level conversion operations that clang lowers
 ; into pairs of shifts.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/simple-loop.ll
+++ b/tests_lit/llvm2ice_tests/simple-loop.ll
 ; This tests a simple loop that sums the elements of an input array.
 ; The O2 check patterns represent the best code currently achieved.
-; RUIN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s

--- a/tests_lit/llvm2ice_tests/unreachable.ll
+++ b/tests_lit/llvm2ice_tests/unreachable.ll
@@ -20,5 +20,11 @@ return:                                           ; preds = %entry
  ret i32 %div
 }
+; CHECK: cmp
+; CHECK: call ice_unreachable
+; CHECK: cdq
+; CHECK: idiv
+; CHECK: ret
 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ