Commit 8e6bf6e1 by Jim Stichnoth

Subzero: Improve/refactor folding loads into the next instruction.

This is turned into a separate (O2-only) pass that looks for opportunities:

1. A Load instruction, or an AtomicLoad intrinsic that would be lowered just like a Load instruction,
2. followed immediately by an instruction of a whitelisted kind that uses the Load dest variable as one of its operands,
3. where the whitelisted instruction ends the live range of the Load dest variable.

In such cases, the original two instructions are deleted and a new instruction is added that folds the load into the whitelisted instruction. We also do some work to splice the liveness information (Inst::LiveRangesEnded and Inst::isLastUse()) into the new instruction, so that the target lowering pass can still take advantage of it. Currently this is used quite sparingly, but in the future we could use it, along with operator commutativity, to choose among different lowering sequences and reduce register pressure.

The whitelisted instruction kinds are chosen primarily on whether the main operation's native instruction can take a memory operand - e.g., arithmetic (add/sub/imul/etc.), compare (cmp/ucomiss), and cast (movsx/movzx/etc.). Notably, call and ret are not included because argument passing is done through simple assignments, for which normal lowering is sufficient.

BUG= none
R=jvoung@chromium.org, mtrofin@chromium.org

Review URL: https://codereview.chromium.org/1169493002
parent bb9d11a5
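A minimal sketch of the pass shape the message describes, based only on the three conditions above. The helper names (isFoldableLoad, isWhitelistedKind, usesAsSource, makeFoldedInst) and the flat instruction vector are assumptions for illustration; only isLastUse(), spliceLivenessInfo(), getDest(), and setDeleted() appear in this commit's interfaces, and the real pass is TargetX8632::doLoadOpt() declared in the header diff below.

// Sketch only: operates on a flat vector of instruction pointers for clarity;
// the real pass walks each CfgNode's instruction list during O2 lowering.
void foldLoadsSketch(std::vector<Inst *> &Insts) {
  for (size_t I = 0; I + 1 < Insts.size(); ++I) {
    Inst *Load = Insts[I];
    Inst *Next = Insts[I + 1];
    if (!isFoldableLoad(Load))                   // 1. Load / AtomicLoad
      continue;
    if (!isWhitelistedKind(Next) ||              // 2. whitelisted kind that
        !usesAsSource(Next, Load->getDest()))    //    reads the load's dest
      continue;
    if (!Next->isLastUse(Load->getDest()))       // 3. ends the dest's live range
      continue;
    Inst *Folded = makeFoldedInst(Next, Load);   // assumed: Next rebuilt with the
                                                 // load's memory operand inlined
    Folded->spliceLivenessInfo(Next, Load);      // preserve LiveRangesEnded bits
    Load->setDeleted();
    Next->setDeleted();
    Insts[I + 1] = Folded;                       // simplified replacement
  }
}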
......@@ -809,8 +809,8 @@ void emitRegisterUsage(Ostream &Str, const Cfg *Func, const CfgNode *Node,
// familiar order.
std::sort(LiveRegs.begin(), LiveRegs.end(),
[](const Variable *V1, const Variable *V2) {
return V1->getRegNum() < V2->getRegNum();
});
return V1->getRegNum() < V2->getRegNum();
});
bool First = true;
for (Variable *Var : LiveRegs) {
if (!First)
......
......@@ -383,8 +383,9 @@ void ELFObjectWriter::writeDataOfType(SectionType ST,
for (VariableDeclaration::Initializer *Init : Var->getInitializers()) {
switch (Init->getKind()) {
case VariableDeclaration::Initializer::DataInitializerKind: {
const auto Data = llvm::cast<VariableDeclaration::DataInitializer>(
Init)->getContents();
const auto Data =
llvm::cast<VariableDeclaration::DataInitializer>(Init)
->getContents();
Section->appendData(Str, llvm::StringRef(Data.data(), Data.size()));
break;
}
......
......@@ -112,6 +112,44 @@ bool Inst::isLastUse(const Operand *TestSrc) const {
return false;
}
// Given an instruction like:
// a = b + c + [x,y] + e
// which was created from OrigInst:
// a = b + c + d + e
// with SpliceAssn spliced in:
// d = [x,y]
//
// Reconstruct the LiveRangesEnded bitmask in this instruction by
// combining the LiveRangesEnded values of OrigInst and SpliceAssn.
// If operands d and [x,y] contain a different number of variables,
// then the bitmask position for e may be different in OrigInst and
// the current instruction, requiring extra shifts and masks in the
// computation. In the example above, OrigInst has variable e in bit
// position 3, whereas the current instruction has e in bit position 4
// because [x,y] consumes 2 bitmask slots while d only consumed 1.
//
// Additionally, set HasSideEffects if either OrigInst or SpliceAssn
// has HasSideEffects set.
void Inst::spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn) {
HasSideEffects |= OrigInst->HasSideEffects;
HasSideEffects |= SpliceAssn->HasSideEffects;
// Find the bitmask index of SpliceAssn's dest within OrigInst.
Variable *SpliceDest = SpliceAssn->getDest();
SizeT Index = 0;
for (SizeT I = 0; I < OrigInst->getSrcSize(); ++I) {
Operand *Src = OrigInst->getSrc(I);
if (Src == SpliceDest) {
LREndedBits LeftMask = OrigInst->LiveRangesEnded & ((1 << Index) - 1);
LREndedBits RightMask = OrigInst->LiveRangesEnded >> (Index + 1);
LiveRangesEnded = LeftMask | (SpliceAssn->LiveRangesEnded << Index) |
(RightMask << (Index + getSrc(I)->getNumVars()));
return;
}
Index += getSrc(I)->getNumVars();
}
llvm::report_fatal_error("Failed to find splice operand");
}
void Inst::livenessLightweight(Cfg *Func, LivenessBV &Live) {
assert(!isDeleted());
resetLastUses();
......
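To make the bit arithmetic in spliceLivenessInfo() concrete, here is a small standalone check of the example from its comment (a = b + c + d + e spliced with d = [x,y]). The particular LiveRangesEnded values are made up for illustration; the mask computation mirrors the function above.

#include <cassert>
#include <cstdint>

int main() {
  // OrigInst   a = b + c + d + e : bit 0 = b, 1 = c, 2 = d, 3 = e
  // SpliceAssn d = [x,y]         : bit 0 = x, 1 = y
  uint32_t OrigLRE = 0b1101;  // say b, d, and e end their live ranges here
  uint32_t SpliceLRE = 0b10;  // say y ends its live range at the load
  uint32_t Index = 2;         // b and c each occupy one bit before d
  uint32_t NumVars = 2;       // [x,y] occupies two bits in the new instruction
  uint32_t Left = OrigLRE & ((1u << Index) - 1);  // keep the b and c bits
  uint32_t Right = OrigLRE >> (Index + 1);        // keep the e bit, drop d's
  uint32_t NewLRE = Left | (SpliceLRE << Index) | (Right << (Index + NumVars));
  // New inst a = b + c + [x,y] + e : bit 0 = b, 1 = c, 2 = x, 3 = y, 4 = e
  assert(NewLRE == 0b11001);  // b (bit 0), y (bit 3), and e (bit 4) end here
  return 0;
}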
......@@ -102,6 +102,7 @@ public:
}
bool isLastUse(const Operand *Src) const;
void spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn);
// Returns a list of out-edges corresponding to a terminator
// instruction, which is the last instruction of the block.
......
......@@ -251,9 +251,9 @@ void TargetLowering::sortVarsByAlignment(VarList &Dest,
// as the buckets, if performance is an issue.
std::sort(Dest.begin(), Dest.end(),
[this](const Variable *V1, const Variable *V2) {
return typeWidthInBytesOnStack(V1->getType()) >
typeWidthInBytesOnStack(V2->getType());
});
return typeWidthInBytesOnStack(V1->getType()) >
typeWidthInBytesOnStack(V2->getType());
});
}
void TargetLowering::getVarStackSlotParams(
......
......@@ -101,6 +101,7 @@ public:
void translateOm1() override;
void translateO2() override;
void doLoadOpt();
bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
SizeT getNumRegisters() const override { return RegX8632::Reg_NUM; }
......@@ -229,7 +230,8 @@ protected:
// Turn a pointer operand into a memory operand that can be
// used by a real load/store operation. Legalizes the operand as well.
// This is a nop if the operand is already a legal memory operand.
OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty);
OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty,
bool DoLegalize = true);
Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
static Type stackSlotType();
......
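For illustration, the new defaulted parameter might be used like this at a hypothetical call site (the actual callers are not shown in this excerpt):

// Default behavior, unchanged: forms and legalizes the memory operand.
OperandX8632Mem *Legal = formMemoryOperand(Ptr, Ty);
// New option: skip legalization, e.g. when the operand will be folded into a
// later instruction and legalized as part of lowering that instruction
// (assumption about intent; not stated in this diff).
OperandX8632Mem *Raw = formMemoryOperand(Ptr, Ty, /*DoLegalize=*/false);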
......@@ -14,7 +14,7 @@ declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
@g32_c = internal global [4 x i8] zeroinitializer, align 4
@g32_d = internal global [4 x i8] zeroinitializer, align 4
define i32 @test_fused_load_add_a() {
define i32 @test_fused_load_sub_a() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
......@@ -22,39 +22,39 @@ entry:
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1
%l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1
%l_b2 = add i32 %l_b, 1
%l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
%l_c = load i32, i32* %p_c, align 1
%l_c2 = add i32 %l_c, 1
%l_c2 = sub i32 1, %l_c
call void @llvm.nacl.atomic.fence.all()
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_add_a
; CHECK-LABEL: test_fused_load_sub_a
; alloca store
; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
; The load + add are optimized into one everywhere.
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
; The load + sub are optimized into one everywhere.
; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR
; CHECK: mfence
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mov DWORD PTR
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mfence
; CHECK: mov DWORD PTR
; Test with the fence moved up a bit.
define i32 @test_fused_load_add_b() {
define i32 @test_fused_load_sub_b() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
......@@ -62,40 +62,40 @@ entry:
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1
%l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1
%l_b2 = add i32 %l_b, 1
%l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
call void @llvm.nacl.atomic.fence.all()
%l_c = load i32, i32* %p_c, align 1
%l_c2 = add i32 %l_c, 1
%l_c2 = sub i32 1, %l_c
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_add_b
; CHECK-LABEL: test_fused_load_sub_b
; alloca store
; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR
; CHECK: mfence
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mov DWORD PTR
; CHECK: mfence
; Load + add can still be optimized into one instruction
; Load + sub can still be optimized into one instruction
; because it is not separated by a fence.
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mov DWORD PTR
; Test with the fence splitting a load/add.
define i32 @test_fused_load_add_c() {
; Test with the fence splitting a load/sub.
define i32 @test_fused_load_sub_c() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
......@@ -103,38 +103,39 @@ entry:
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1
%l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1
call void @llvm.nacl.atomic.fence.all()
%l_b2 = add i32 %l_b, 1
%l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
%l_c = load i32, i32* %p_c, align 1
%l_c2 = add i32 %l_c, 1
%l_c2 = sub i32 1, %l_c
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_add_c
; CHECK-LABEL: test_fused_load_sub_c
; alloca store
; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR
; CHECK: mfence
; This load + add are no longer optimized into one,
; This load + sub are no longer optimized into one,
; though perhaps it should be legal as long as
; the load stays on the same side of the fence.
; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mfence
; CHECK: add {{.*}},0x1
; CHECK: mov {{.*}},0x1
; CHECK: sub
; CHECK: mov DWORD PTR
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mov DWORD PTR
......
......@@ -95,17 +95,17 @@ entry:
next:
%ptr = inttoptr i32 %iptr to i32*
%r = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
%r2 = add i32 %r, 32
%r2 = sub i32 32, %r
ret i32 %r2
}
; CHECK-LABEL: test_atomic_load_32_with_arith
; CHECK: mov {{.*}},DWORD
; The next instruction may be a separate load or folded into an add.
;
; In O2 mode, we know that the load and add are going to be fused.
; In O2 mode, we know that the load and sub are going to be fused.
; O2-LABEL: test_atomic_load_32_with_arith
; O2: mov {{.*}},DWORD
; O2: add {{.*}},DWORD
; O2: sub {{.*}},DWORD
define i32 @test_atomic_load_32_ignored(i32 %iptr) {
entry:
......