Commit e6d24789 by Jim Stichnoth

Subzero: Randomize register assignment.

Randomize the order in which registers appear in the free list. Only fully "equivalent" registers are randomized, to ensure no extra spills. This adds the -randomize-regalloc option. This is a continuation of https://codereview.chromium.org/456033003/ which Matt owns.

BUG= none
R=jfb@chromium.org

Review URL: https://codereview.chromium.org/807293003
parent 4019f084
......@@ -122,6 +122,7 @@ enum VerboseItem {
IceV_LinearScan = 1 << 8,
IceV_Frame = 1 << 9,
IceV_AddrOpt = 1 << 10,
IceV_Random = 1 << 11,
IceV_All = ~IceV_None,
IceV_Most = IceV_All & ~IceV_LinearScan
};
......
......@@ -42,7 +42,7 @@ class RandomNumberGeneratorWrapper {
operator=(const RandomNumberGeneratorWrapper &) = delete;
public:
uint64_t next(uint64_t Max) { return RNG.next(Max); }
uint64_t operator()(uint64_t Max) { return RNG.next(Max); }
bool getTrueWithProbability(float Probability);
RandomNumberGeneratorWrapper(RandomNumberGenerator &RNG) : RNG(RNG) {}
......
......@@ -141,7 +141,7 @@ void LinearScan::initForGlobal() {
//
// Some properties we aren't (yet) taking advantage of:
//
// * Because live ranges are a single segment, the Unhandled set will
// * Because live ranges are a single segment, the Inactive set will
// always be empty, and the live range trimming operation is
// unnecessary.
//
......@@ -258,7 +258,8 @@ void LinearScan::init(RegAllocKind Kind) {
// Requires running Cfg::liveness(Liveness_Intervals) in
// preparation. Results are assigned to Variable::RegNum for each
// Variable.
void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull,
bool Randomized) {
TimerMarker T(TimerStack::TT_linearScan, Func);
assert(RegMaskFull.any()); // Sanity check
Ostream &Str = Func->getContext()->getStrDump();
......@@ -266,6 +267,13 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
ALLOW_DUMP && Func->getContext()->isVerbose(IceV_LinearScan);
Func->resetCurrentNode();
VariablesMetadata *VMetadata = Func->getVMetadata();
const size_t NumRegisters = RegMaskFull.size();
llvm::SmallBitVector PreDefinedRegisters(NumRegisters);
if (Randomized) {
for (Variable *Var : UnhandledPrecolored) {
PreDefinedRegisters[Var->getRegNum()] = true;
}
}
// Build a LiveRange representing the Kills list.
LiveRange KillsRange(Kills);
......@@ -274,7 +282,7 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
// RegUses[I] is the number of live ranges (variables) that register
// I is currently assigned to. It can be greater than 1 as a result
// of AllowOverlap inference below.
std::vector<int> RegUses(RegMaskFull.size());
std::vector<int> RegUses(NumRegisters);
// Unhandled is already set to all ranges in increasing order of
// start points.
assert(Active.empty());
......@@ -662,23 +670,39 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
Inactive.clear();
dump(Func);
// Finish up by assigning RegNumTmp->RegNum for each Variable.
// TODO(stichnot): Statically choose the size based on the target
// being compiled.
const size_t REGS_SIZE = 32;
llvm::SmallVector<int32_t, REGS_SIZE> Permutation(NumRegisters);
if (Randomized) {
Func->getTarget()->makeRandomRegisterPermutation(
Permutation, PreDefinedRegisters | ~RegMaskFull);
}
// Finish up by assigning RegNumTmp->RegNum (or a random permutation
// thereof) for each Variable.
for (Variable *Item : Handled) {
int32_t RegNum = Item->getRegNumTmp();
int32_t AssignedRegNum = RegNum;
if (Randomized && Item->hasRegTmp() && !Item->hasReg()) {
AssignedRegNum = Permutation[RegNum];
}
if (Verbose) {
if (!Item->hasRegTmp()) {
Str << "Not assigning ";
Item->dump(Func);
Str << "\n";
} else {
Str << (RegNum == Item->getRegNum() ? "Reassigning " : "Assigning ")
<< Func->getTarget()->getRegName(RegNum, IceType_i32) << "(r"
<< RegNum << ") to ";
Str << (AssignedRegNum == Item->getRegNum() ? "Reassigning "
: "Assigning ")
<< Func->getTarget()->getRegName(AssignedRegNum, IceType_i32)
<< "(r" << AssignedRegNum << ") to ";
Item->dump(Func);
Str << "\n";
}
}
Item->setRegNum(Item->getRegNumTmp());
Item->setRegNum(AssignedRegNum);
}
// TODO: Consider running register allocation one more time, with
......
......@@ -29,7 +29,7 @@ public:
LinearScan(Cfg *Func)
: Func(Func), FindPreference(false), FindOverlap(false) {}
void init(RegAllocKind Kind);
void scan(const llvm::SmallBitVector &RegMask);
void scan(const llvm::SmallBitVector &RegMask, bool Randomized);
void dump(Cfg *Func) const;
private:
......
......@@ -29,6 +29,7 @@ namespace Ice {
namespace {
// TODO(stichnot): Move this machinery into llvm2ice.cpp.
namespace cl = llvm::cl;
cl::opt<bool> DoNopInsertion("nop-insertion", cl::desc("Randomly insert NOPs"),
cl::init(false));
......@@ -40,6 +41,11 @@ cl::opt<int> MaxNopsPerInstruction(
cl::opt<int> NopProbabilityAsPercentage(
"nop-insertion-percentage",
cl::desc("Nop insertion probability as percentage"), cl::init(10));
cl::opt<bool>
CLRandomizeRegisterAllocation("randomize-regalloc",
cl::desc("Randomize register allocation"),
cl::init(false));
} // end of anonymous namespace
void LoweringContext::init(CfgNode *N) {
......@@ -95,6 +101,12 @@ TargetLowering *TargetLowering::createLowering(TargetArch Target, Cfg *Func) {
return NULL;
}
// TargetLowering constructor.  Captures the owning Cfg and its
// GlobalContext, and latches the -randomize-regalloc command-line flag
// (CLRandomizeRegisterAllocation) into the const
// RandomizeRegisterAllocation member so the setting is fixed for the
// lifetime of this lowering object.  Frame/stack bookkeeping members
// start out false/zero; Context is default-initialized.
TargetLowering::TargetLowering(Cfg *Func)
: Func(Func), Ctx(Func->getContext()),
RandomizeRegisterAllocation(CLRandomizeRegisterAllocation),
HasComputedFrame(false), CallsReturnsTwice(false), StackAdjustment(0),
Context() {}
Assembler *TargetLowering::createAssembler(TargetArch Target, Cfg *Func) {
// These statements can be #ifdef'd to specialize the assembler
// to a subset of the available targets. TODO: use CRTP.
......@@ -236,7 +248,7 @@ void TargetLowering::regAlloc(RegAllocKind Kind) {
RegExclude |= RegSet_FramePointer;
LinearScan.init(Kind);
llvm::SmallBitVector RegMask = getRegisterSet(RegInclude, RegExclude);
LinearScan.scan(RegMask);
LinearScan.scan(RegMask, RandomizeRegisterAllocation);
}
TargetGlobalInitLowering *
......
......@@ -191,6 +191,10 @@ public:
virtual const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const = 0;
void regAlloc(RegAllocKind Kind);
virtual void makeRandomRegisterPermutation(
llvm::SmallVectorImpl<int32_t> &Permutation,
const llvm::SmallBitVector &ExcludeRegisters) const = 0;
virtual void emitVariable(const Variable *Var) const = 0;
// Performs target-specific argument lowering.
......@@ -204,9 +208,7 @@ public:
virtual ~TargetLowering() {}
protected:
TargetLowering(Cfg *Func)
: Func(Func), Ctx(Func->getContext()), HasComputedFrame(false),
CallsReturnsTwice(false), StackAdjustment(0) {}
TargetLowering(Cfg *Func);
virtual void lowerAlloca(const InstAlloca *Inst) = 0;
virtual void lowerArithmetic(const InstArithmetic *Inst) = 0;
virtual void lowerAssign(const InstAssign *Inst) = 0;
......@@ -235,6 +237,7 @@ protected:
Cfg *Func;
GlobalContext *Ctx;
const bool RandomizeRegisterAllocation;
bool HasComputedFrame;
bool CallsReturnsTwice;
// StackAdjustment keeps track of the current stack offset from its
......
......@@ -3869,7 +3869,7 @@ void TargetX8632::doAddressOptLoad() {
// With probability Probability, emit a single nop instruction whose
// variant is chosen uniformly at random from the X86_NUM_NOP_VARIANTS
// available encodings.  Called per-instruction when -nop-insertion is
// enabled.
//
// NOTE(review): the displayed block contained both the pre-change
// _nop(RNG.next(...)) line and its replacement _nop(RNG(...)) — diff
// residue that would emit two nops and advance the RNG twice.  Only the
// operator() form is kept.
void TargetX8632::randomlyInsertNop(float Probability) {
  RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
  if (RNG.getTrueWithProbability(Probability)) {
    _nop(RNG(X86_NUM_NOP_VARIANTS));
  }
}
......@@ -4530,6 +4530,72 @@ void TargetX8632::postLower() {
}
}
// Builds a permutation of the register file for randomized register
// allocation.  Registers listed in ExcludeRegisters map to themselves;
// every other register is grouped into an equivalence class keyed by its
// properties (scratch/preserved/isI8/isInt/isFP) and shuffled only within
// its class, so the permutation can never introduce extra spills.
// Permutation[i] is the randomized replacement for register i.
void TargetX8632::makeRandomRegisterPermutation(
    llvm::SmallVectorImpl<int32_t> &Permutation,
    const llvm::SmallBitVector &ExcludeRegisters) const {
  // TODO(stichnot): Declaring Permutation this way loses type/size
  // information.  Fix this in conjunction with the caller-side TODO.
  assert(Permutation.size() >= RegX8632::Reg_NUM);
  // Expected upper bound on the number of registers in a single
  // equivalence class.  For x86-32, this would comprise the 8 XMM
  // registers.  This is for performance, not correctness.
  static const unsigned MaxEquivalenceClassSize = 8;
  typedef llvm::SmallVector<int32_t, MaxEquivalenceClassSize> RegisterList;
  typedef std::map<uint32_t, RegisterList> EquivalenceClassMap;
  EquivalenceClassMap EquivalenceClasses;
  SizeT NumShuffled = 0, NumPreserved = 0;

  // Build up the equivalence classes of registers by looking at the
  // register properties as well as whether the registers should be
  // explicitly excluded from shuffling.
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
          frameptr, isI8, isInt, isFP)                                         \
  if (ExcludeRegisters[RegX8632::val]) {                                       \
    /* val stays the same in the resulting permutation. */                     \
    Permutation[RegX8632::val] = RegX8632::val;                                \
    ++NumPreserved;                                                            \
  } else {                                                                     \
    const uint32_t Index = (scratch << 0) | (preserved << 1) | (isI8 << 2) |   \
                           (isInt << 3) | (isFP << 4);                         \
    /* val is assigned to an equivalence class based on its properties. */     \
    EquivalenceClasses[Index].push_back(RegX8632::val);                        \
  }
  REGX8632_TABLE
#undef X

  RandomNumberGeneratorWrapper RNG(Ctx->getRNG());

  // Shuffle the resulting equivalence classes.  Iterate by const
  // reference: the mapped type is a SmallVector, so iterating by value
  // ("auto I") would deep-copy every register list on each iteration
  // (clang-tidy performance-for-range-copy).
  for (const auto &I : EquivalenceClasses) {
    const RegisterList &List = I.second;
    RegisterList Shuffled(List);
    std::random_shuffle(Shuffled.begin(), Shuffled.end(), RNG);
    for (size_t SI = 0, SE = Shuffled.size(); SI < SE; ++SI) {
      Permutation[List[SI]] = Shuffled[SI];
      ++NumShuffled;
    }
  }

  // Every register is either preserved or shuffled, exactly once.
  assert(NumShuffled + NumPreserved == RegX8632::Reg_NUM);

  // Under -verbose=random, dump the equivalence classes for debugging.
  if (Func->getContext()->isVerbose(IceV_Random)) {
    Ostream &Str = Func->getContext()->getStrDump();
    Str << "Register equivalence classes:\n";
    for (const auto &I : EquivalenceClasses) {
      Str << "{";
      const RegisterList &List = I.second;
      bool First = true;
      for (int32_t Register : List) {
        if (!First)
          Str << " ";
        First = false;
        Str << getRegName(Register, IceType_i32);
      }
      Str << "}\n";
    }
  }
}
template <> void ConstantInteger32::emit(GlobalContext *Ctx) const {
if (!ALLOW_DUMP)
return;
......
......@@ -180,6 +180,10 @@ protected:
OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
uint32_t Offset = 0);
void makeRandomRegisterPermutation(
llvm::SmallVectorImpl<int32_t> &Permutation,
const llvm::SmallBitVector &ExcludeRegisters) const;
// The following are helpers that insert lowered x86 instructions
// with minimal syntactic overhead, so that the lowering code can
// look as close to assembly as practical.
......
......@@ -47,6 +47,7 @@ static cl::list<Ice::VerboseItem> VerboseList(
clEnumValN(Ice::IceV_LinearScan, "regalloc", "Linear scan details"),
clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"),
clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"),
clEnumValN(Ice::IceV_Random, "random", "Randomization details"),
clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
clEnumValN(Ice::IceV_Most, "most",
"Use all verbose options except 'regalloc' and 'time'"),
......
; This is a smoke test of randomized register allocation. The output
; of this test will change with changes to the random number generator
; implementation.
; RUN: %p2i -i %s --args -O2 -sz-seed=1 -randomize-regalloc \
; RUN: | llvm-mc -triple=i686-none-nacl -filetype=obj \
; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
; RUN: | FileCheck %s --check-prefix=CHECK_1
; RUN: %p2i -i %s --args -Om1 -sz-seed=1 -randomize-regalloc \
; RUN: | llvm-mc -triple=i686-none-nacl -filetype=obj \
; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
; RUN: | FileCheck %s --check-prefix=OPTM1_1
; Same tests but with a different seed, just to verify randomness.
; RUN: %p2i -i %s --args -O2 -sz-seed=123 -randomize-regalloc \
; RUN: | llvm-mc -triple=i686-none-nacl -filetype=obj \
; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
; RUN: | FileCheck %s --check-prefix=CHECK_123
; RUN: %p2i -i %s --args -Om1 -sz-seed=123 -randomize-regalloc \
; RUN: | llvm-mc -triple=i686-none-nacl -filetype=obj \
; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
; RUN: | FileCheck %s --check-prefix=OPTM1_123
; Multiplies two <4 x i32> vectors.  SSE2 has no packed 32-bit multiply,
; so the lowering uses pmuludq on the even lanes, pshufd to bring the odd
; lanes into position, a second pmuludq, and shufps/pshufd to interleave
; the results.  The specific xmm registers checked below are whatever the
; seeded RNG produced; presumably they must be regenerated whenever the
; RNG implementation changes (see the note at the top of this file).  The
; two seeds yield different register choices, demonstrating that
; -randomize-regalloc actually perturbs the assignment.
define <4 x i32> @mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
entry:
%res = mul <4 x i32> %a, %b
ret <4 x i32> %res
; OPTM1_1-LABEL: mul_v4i32:
; OPTM1_1: sub esp, 60
; OPTM1_1-NEXT: movups xmmword ptr [esp + 32], xmm0
; OPTM1_1-NEXT: movups xmmword ptr [esp + 16], xmm1
; OPTM1_1-NEXT: movups xmm0, xmmword ptr [esp + 32]
; OPTM1_1-NEXT: pshufd xmm7, xmmword ptr [esp + 32], 49
; OPTM1_1-NEXT: pshufd xmm4, xmmword ptr [esp + 16], 49
; OPTM1_1-NEXT: pmuludq xmm0, xmmword ptr [esp + 16]
; OPTM1_1-NEXT: pmuludq xmm7, xmm4
; OPTM1_1-NEXT: shufps xmm0, xmm7, -120
; OPTM1_1-NEXT: pshufd xmm0, xmm0, -40
; OPTM1_1-NEXT: movups xmmword ptr [esp], xmm0
; OPTM1_1-NEXT: movups xmm0, xmmword ptr [esp]
; OPTM1_1-NEXT: add esp, 60
; OPTM1_1-NEXT: ret
; CHECK_1-LABEL: mul_v4i32:
; CHECK_1: movups xmm6, xmm0
; CHECK_1-NEXT: pshufd xmm0, xmm0, 49
; CHECK_1-NEXT: pshufd xmm5, xmm1, 49
; CHECK_1-NEXT: pmuludq xmm6, xmm1
; CHECK_1-NEXT: pmuludq xmm0, xmm5
; CHECK_1-NEXT: shufps xmm6, xmm0, -120
; CHECK_1-NEXT: pshufd xmm6, xmm6, -40
; CHECK_1-NEXT: movups xmm0, xmm6
; CHECK_1-NEXT: ret
; OPTM1_123-LABEL: mul_v4i32:
; OPTM1_123: sub esp, 60
; OPTM1_123-NEXT: movups xmmword ptr [esp + 32], xmm0
; OPTM1_123-NEXT: movups xmmword ptr [esp + 16], xmm1
; OPTM1_123-NEXT: movups xmm0, xmmword ptr [esp + 32]
; OPTM1_123-NEXT: pshufd xmm2, xmmword ptr [esp + 32], 49
; OPTM1_123-NEXT: pshufd xmm6, xmmword ptr [esp + 16], 49
; OPTM1_123-NEXT: pmuludq xmm0, xmmword ptr [esp + 16]
; OPTM1_123-NEXT: pmuludq xmm2, xmm6
; OPTM1_123-NEXT: shufps xmm0, xmm2, -120
; OPTM1_123-NEXT: pshufd xmm0, xmm0, -40
; OPTM1_123-NEXT: movups xmmword ptr [esp], xmm0
; OPTM1_123-NEXT: movups xmm0, xmmword ptr [esp]
; OPTM1_123-NEXT: add esp, 60
; OPTM1_123-NEXT: ret
; CHECK_123-LABEL: mul_v4i32:
; CHECK_123: movups xmm3, xmm0
; CHECK_123-NEXT: pshufd xmm0, xmm0, 49
; CHECK_123-NEXT: pshufd xmm7, xmm1, 49
; CHECK_123-NEXT: pmuludq xmm3, xmm1
; CHECK_123-NEXT: pmuludq xmm0, xmm7
; CHECK_123-NEXT: shufps xmm3, xmm0, -120
; CHECK_123-NEXT: pshufd xmm3, xmm3, -40
; CHECK_123-NEXT: movups xmm0, xmm3
; CHECK_123-NEXT: ret
}
; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment