Subzero: Make the register allocator more robust with -reg-use and -reg-exclude.

The problem is that if you use -reg-use or -reg-exclude too aggressively, you can get failures because of inherently high register pressure, and there are also contributions from the "specialty" register classes. For example, when you combine load optimization, address mode inference, local register availability optimization, and the div instruction, you can end up needing 5 simultaneously live infinite-weight registers.

The fix/enhancement here is to keep track of the "reserve" set of registers for each register class, and allow the register allocator to draw from that as a last resort. This behavior is guarded by the -reg-reserve flag.

This CL also includes two improvements in lowering sequences to reduce register pressure.

BUG= none
R=kschimpf@google.com

Review URL: https://codereview.chromium.org/1641653004 .

Subzero: Make the register allocator more robust with -reg-use and -reg-exclude.
b40595a1 · Jim Stichnoth · 029bed9c · b40595a1 · b40595a1 · b40595a1
Commit b40595a1 authored Jan 29, 2016 by Jim Stichnoth
17 changed files
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -55,6 +55,30 @@ Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber)

 Cfg::~Cfg() { assert(ICE_TLS_GET_FIELD(CurrentCfg) == nullptr); }

+/// Create a string like "foo(i=123:b=9)" indicating the function name, number
+/// of high-level instructions, and number of basic blocks.  This string is only
+/// used for dumping and other diagnostics, and the idea is that given a set of
+/// functions to debug a problem on, it's easy to find the smallest or simplest
+/// function to attack.  Note that the counts may change somewhat depending on
+/// what point it is called during the translation passes.
+IceString Cfg::getFunctionNameAndSize() const {
+  if (!BuildDefs::dump())
+    return getFunctionName();
+  SizeT NodeCount = 0;
+  SizeT InstCount = 0;
+  for (CfgNode *Node : getNodes()) {
+    ++NodeCount;
+    // Note: deleted instructions are *not* ignored.
+    InstCount += Node->getPhis().size();
+    for (Inst &I : Node->getInsts()) {
+      if (!llvm::isa<InstTarget>(&I))
+        ++InstCount;
+    }
+  }
+  return getFunctionName() + "(i=" + std::to_string(InstCount) + ":b=" +
+         std::to_string(NodeCount) + ")";
+}
+
 void Cfg::setError(const IceString &Message) {
  HasError = true;
  ErrorMessage = Message;
@@ -1075,7 +1099,9 @@ void Cfg::dump(const IceString &Message) {
      Str << Args[i]->getType() << " ";
      Args[i]->dump(this);
    }
-    Str << ") {\n";
+    // Append an extra copy of the function name here, in order to print its
+    // size stats but not mess up lit tests.
+    Str << ") { # " << getFunctionNameAndSize() << "\n";
  }
  resetCurrentNode();
  if (isVerbose(IceV_Liveness)) {

--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -64,7 +64,8 @@ public:
  /// \name Manage the name and return type of the function being translated.
  /// @{
  void setFunctionName(const IceString &Name) { FunctionName = Name; }
-  IceString getFunctionName() const { return FunctionName; }
+  const IceString &getFunctionName() const { return FunctionName; }
+  IceString getFunctionNameAndSize() const;
  void setReturnType(Type Ty) { ReturnType = Ty; }
  Type getReturnType() const { return ReturnType; }
  /// @}

--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -185,6 +185,13 @@ cl::opt<bool>
                                cl::desc("Randomize register allocation"),
                                cl::init(false));

+/// Allow failsafe access to registers that were restricted via -reg-use or
+/// -reg-exclude.
+cl::opt<bool>
+    RegAllocReserve("reg-reserve",
+                    cl::desc("Let register allocation use reserve registers"),
+                    cl::init(false));
+
 /// Repeat register allocation until convergence.
 cl::opt<bool>
    RepeatRegAlloc("regalloc-repeat",
@@ -545,6 +552,7 @@ void ClFlags::getParsedClFlags(ClFlags &OutFlags) {
  OutFlags.setShouldReorderBasicBlocks(::ReorderBasicBlocks);
  OutFlags.setShouldDoNopInsertion(::ShouldDoNopInsertion);
  OutFlags.setShouldRandomizeRegAlloc(::RandomizeRegisterAllocation);
+  OutFlags.setRegAllocReserve(::RegAllocReserve);
  OutFlags.setShouldRepeatRegAlloc(::RepeatRegAlloc);
  OutFlags.setShouldReorderFunctions(::ReorderFunctions);
  OutFlags.setShouldReorderGlobalVariables(::ReorderGlobalVariables);

--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -169,6 +169,11 @@ public:
  /// Set ClFlags::RandomRegAlloc to a new value
  void setShouldRandomizeRegAlloc(bool NewValue) { RandomRegAlloc = NewValue; }

+  /// Get the value of ClFlags::RegAllocReserve
+  bool getRegAllocReserve() const { return RegAllocReserve; }
+  /// Set ClFlags::RegAllocReserve to a new value
+  void setRegAllocReserve(bool NewValue) { RegAllocReserve = NewValue; }
+
  /// Get the value of ClFlags::RepeatRegAlloc
  bool shouldRepeatRegAlloc() const { return RepeatRegAlloc; }
  /// Set ClFlags::RepeatRegAlloc to a new value
@@ -425,6 +430,8 @@ private:
  bool RandomNopInsertion;
  /// see anonymous_namespace{IceClFlags.cpp}::RandomizeRegisterAllocation
  bool RandomRegAlloc;
+  /// see anonymous_namespace{IceClFlags.cpp}::RegAllocReserve
+  bool RegAllocReserve;
  /// see anonymous_namespace{IceClFlags.cpp}::RepeatRegAlloc
  bool RepeatRegAlloc;
  /// see anonymous_namespace{IceClFlags.cpp}::ReorderBasicBlocks

--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -309,7 +309,8 @@ void GlobalContext::translateFunctions() {
      getErrorStatus()->assign(EC_Translation);
      OstreamLocker L(this);
      getStrError() << "ICE translation error: " << Func->getFunctionName()
-                    << ": " << Func->getError() << "\n";
+                    << ": " << Func->getError() << ": "
+                    << Func->getFunctionNameAndSize() << "\n";
      Item = new EmitterWorkItem(Func->getSequenceNumber());
    } else {
      Func->getAssembler<>()->setInternal(Func->getInternal());
@@ -320,7 +321,7 @@ void GlobalContext::translateFunctions() {
        // The Cfg has already emitted into the assembly buffer, so
        // stats have been fully collected into this thread's TLS.
        // Dump them before TLS is reset for the next Cfg.
-        dumpStats(Func->getFunctionName());
+        dumpStats(Func->getFunctionNameAndSize());
        Assembler *Asm = Func->releaseAssembler();
        // Copy relevant fields into Asm before Func is deleted.
        Asm->setFunctionName(Func->getFunctionName());
@@ -549,7 +550,7 @@ void GlobalContext::emitItems() {
        Cfg::setCurrentCfg(Func.get());
        Func->emit();
        Cfg::setCurrentCfg(nullptr);
-        dumpStats(Func->getFunctionName());
+        dumpStats(Func->getFunctionNameAndSize());
      } break;
      }
    }

--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
--- a/src/IceRegAlloc.h
+++ b/src/IceRegAlloc.h
@@ -61,7 +61,9 @@ private:
    int32_t PreferReg = Variable::NoRegister;
    bool AllowOverlap = false;
    llvm::SmallBitVector RegMask;
+    llvm::SmallBitVector RegMaskUnfiltered;
    llvm::SmallBitVector Free;
+    llvm::SmallBitVector FreeUnfiltered;
    llvm::SmallBitVector PrecoloredUnhandledMask; // Note: only used for dumping
    llvm::SmallVector<RegWeight, REGS_SIZE> Weights;
  };
@@ -98,7 +100,7 @@ private:
  void filterFreeWithPrecoloredRanges(IterationState &Iter);
  void allocatePrecoloredRegister(Variable *Cur);
  void allocatePreferredRegister(IterationState &Iter);
-  void allocateFreeRegister(IterationState &Iter);
+  void allocateFreeRegister(IterationState &Iter, bool Filtered);
  void handleNoFreeRegisters(IterationState &Iter);
  void assignFinalRegisters(const llvm::SmallBitVector &RegMaskFull,
                            const llvm::SmallBitVector &PreDefinedRegisters,
@@ -130,6 +132,7 @@ private:
  bool FindOverlap = false;

  const bool Verbose;
+  const bool UseReserve;
 };

 } // end of namespace Ice

--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -275,8 +275,15 @@ public:

  virtual llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                              RegSetMask Exclude) const = 0;
+  /// Get the set of physical registers available for the specified Variable's
+  /// register class, applying register restrictions from the command line.
  virtual const llvm::SmallBitVector &
  getRegistersForVariable(const Variable *Var) const = 0;
+  /// Get the set of *all* physical registers available for the specified
+  /// Variable's register class, *not* applying register restrictions from the
+  /// command line.
+  virtual const llvm::SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const = 0;
  virtual const llvm::SmallBitVector &getAliasesForRegister(SizeT) const = 0;

  void regAlloc(RegAllocKind Kind);

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -342,6 +342,9 @@ void TargetARM32::staticInit(GlobalContext *Ctx) {
  TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
  TypeToRegisterSet[IceType_v4f32] = VectorRegisters;

+  for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i)
+    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
+
  filterTypeToRegisterSet(
      Ctx, RegARM32::Reg_NUM, TypeToRegisterSet,
      llvm::array_lengthof(TypeToRegisterSet), [](int32_t RegNum) -> IceString {
@@ -6514,6 +6517,8 @@ void TargetHeaderARM32::lower() {
 }

 llvm::SmallBitVector TargetARM32::TypeToRegisterSet[RegARM32::RCARM32_NUM];
+llvm::SmallBitVector
+    TargetARM32::TypeToRegisterSetUnfiltered[RegARM32::RCARM32_NUM];
 llvm::SmallBitVector TargetARM32::RegisterAliases[RegARM32::Reg_NUM];

 } // end of namespace ARM32

--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -88,6 +88,12 @@ public:
    assert(RC < RC_Target);
    return TypeToRegisterSet[RC];
  }
+  const llvm::SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(RC < RC_Target);
+    return TypeToRegisterSetUnfiltered[RC];
+  }
  const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
    return RegisterAliases[Reg];
  }
@@ -1020,6 +1026,8 @@ protected:
  uint32_t MaxOutArgsSizeBytes = 0;
  // TODO(jpp): std::array instead of array.
  static llvm::SmallBitVector TypeToRegisterSet[RegARM32::RCARM32_NUM];
+  static llvm::SmallBitVector
+      TypeToRegisterSetUnfiltered[RegARM32::RCARM32_NUM];
  static llvm::SmallBitVector RegisterAliases[RegARM32::Reg_NUM];
  llvm::SmallBitVector RegsUsed;
  VarList PhysicalRegisters[IceType_NUM];

--- a/src/IceTargetLoweringMIPS32.cpp
+++ b/src/IceTargetLoweringMIPS32.cpp
@@ -116,6 +116,9 @@ void TargetMIPS32::staticInit(GlobalContext *Ctx) {
  TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
  TypeToRegisterSet[IceType_v4f32] = VectorRegisters;

+  for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i)
+    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
+
  filterTypeToRegisterSet(Ctx, RegMIPS32::Reg_NUM, TypeToRegisterSet,
                          llvm::array_lengthof(TypeToRegisterSet),
                          RegMIPS32::getRegName, getRegClassName);
@@ -1126,6 +1129,7 @@ void TargetHeaderMIPS32::lower() {
 }

 llvm::SmallBitVector TargetMIPS32::TypeToRegisterSet[RCMIPS32_NUM];
+llvm::SmallBitVector TargetMIPS32::TypeToRegisterSetUnfiltered[RCMIPS32_NUM];
 llvm::SmallBitVector TargetMIPS32::RegisterAliases[RegMIPS32::Reg_NUM];

 } // end of namespace MIPS32

--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -57,6 +57,12 @@ public:
    assert(RC < RC_Target);
    return TypeToRegisterSet[RC];
  }
+  const llvm::SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(RC < RC_Target);
+    return TypeToRegisterSetUnfiltered[RC];
+  }
  const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
    return RegisterAliases[Reg];
  }
@@ -263,6 +269,7 @@ protected:
  bool UsesFramePointer = false;
  bool NeedsStackAlignment = false;
  static llvm::SmallBitVector TypeToRegisterSet[RCMIPS32_NUM];
+  static llvm::SmallBitVector TypeToRegisterSetUnfiltered[RCMIPS32_NUM];
  static llvm::SmallBitVector RegisterAliases[RegMIPS32::Reg_NUM];
  llvm::SmallBitVector RegsUsed;
  VarList PhysicalRegisters[IceType_NUM];

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -107,6 +107,10 @@ std::array<llvm::SmallBitVector, RCX86_NUM>
    TargetX86Base<X8632::Traits>::TypeToRegisterSet = {{}};

 template <>
+std::array<llvm::SmallBitVector, RCX86_NUM>
+    TargetX86Base<X8632::Traits>::TypeToRegisterSetUnfiltered = {{}};
+
+template <>
 std::array<llvm::SmallBitVector,
           TargetX86Base<X8632::Traits>::Traits::RegisterSet::Reg_NUM>
    TargetX86Base<X8632::Traits>::RegisterAliases = {{}};

--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -107,6 +107,10 @@ std::array<llvm::SmallBitVector, RCX86_NUM>
    TargetX86Base<X8664::Traits>::TypeToRegisterSet = {{}};

 template <>
+std::array<llvm::SmallBitVector, RCX86_NUM>
+    TargetX86Base<X8664::Traits>::TypeToRegisterSetUnfiltered = {{}};
+
+template <>
 std::array<llvm::SmallBitVector,
           TargetX86Base<X8664::Traits>::Traits::RegisterSet::Reg_NUM>
    TargetX86Base<X8664::Traits>::RegisterAliases = {{}};

--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -124,6 +124,13 @@ public:
    return TypeToRegisterSet[RC];
  }

+  const llvm::SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
+    return TypeToRegisterSetUnfiltered[RC];
+  }
+
  const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
    assert(Reg < Traits::RegisterSet::Reg_NUM);
    return RegisterAliases[Reg];
@@ -974,6 +981,8 @@ protected:
  bool PrologEmitsFixedAllocas = false;
  uint32_t MaxOutArgsSizeBytes = 0;
  static std::array<llvm::SmallBitVector, RCX86_NUM> TypeToRegisterSet;
+  static std::array<llvm::SmallBitVector, RCX86_NUM>
+      TypeToRegisterSetUnfiltered;
  static std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
      RegisterAliases;
  llvm::SmallBitVector RegsUsed;

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -379,6 +379,8 @@ template <typename TraitsType>
 void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
  Traits::initRegisterSet(Ctx->getFlags(), &TypeToRegisterSet,
                          &RegisterAliases);
+  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
+    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
                          Traits::getRegName, getRegClassName);
@@ -1945,8 +1947,6 @@ void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Inst) {
      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
      _mov(T_1, Src0Hi);
      _imul(T_1, Src1Lo);
-      _mov(T_2, Src1Hi);
-      _imul(T_2, Src0Lo);
      _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
      _mul(T_4Lo, T_3, Src1Lo);
      // The mul instruction produces two dest variables, edx:eax. We create a
@@ -1954,6 +1954,8 @@ void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Inst) {
      Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
      _mov(DestLo, T_4Lo);
      _add(T_4Hi, T_1);
+      _mov(T_2, Src1Hi);
+      _imul(T_2, Src0Lo);
      _add(T_4Hi, T_2);
      _mov(DestHi, T_4Hi);
    } break;
@@ -5801,8 +5803,8 @@ void TargetX86Base<TraitsType>::lowerStore(const InstStore *Inst) {
  if (!Traits::Is64Bit && Ty == IceType_i64) {
    Value = legalizeUndef(Value);
    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
-    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
    _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
+    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
    _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
  } else if (isVectorType(Ty)) {
    _storep(legalizeToReg(Value), NewAddr);

--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -125,7 +125,7 @@ entry:
 ; OPTM1-LABEL: pass64BitConstArg
 ; OPTM1:      sub     esp
 ; OPTM1:      mov     DWORD PTR [esp+0x4]
-; OPTM1-NEXT: mov     DWORD PTR [esp]
+; OPTM1:      mov     DWORD PTR [esp]
 ; OPTM1-NEXT: mov     DWORD PTR [esp+0x8],0x7b
 ; Bundle padding might be added (so not using -NEXT).
 ; OPTM1:      mov     DWORD PTR [esp+0x10],0xdeadbeef
@@ -277,16 +277,16 @@ entry:
 }
 ; CHECK-LABEL: mul64BitSigned
 ; CHECK: imul
-; CHECK: imul
 ; CHECK: mul
 ; CHECK: add
+; CHECK: imul
 ; CHECK: add
 ;
 ; OPTM1-LABEL: mul64BitSigned
 ; OPTM1: imul
-; OPTM1: imul
 ; OPTM1: mul
 ; OPTM1: add
+; OPTM1: imul
 ; OPTM1: add

 ; ARM32-LABEL: mul64BitSigned
@@ -302,16 +302,16 @@ entry:
 }
 ; CHECK-LABEL: mul64BitUnsigned
 ; CHECK: imul
-; CHECK: imul
 ; CHECK: mul
 ; CHECK: add
+; CHECK: imul
 ; CHECK: add
 ;
 ; OPTM1-LABEL: mul64BitUnsigned
 ; OPTM1: imul
-; OPTM1: imul
 ; OPTM1: mul
 ; OPTM1: add
+; OPTM1: imul
 ; OPTM1: add

 ; ARM32-LABEL: mul64BitUnsigned