Subzero: Improve/refactor folding loads into the next instruction.

This is turned into a separate (O2-only) pass that looks for opportunities:

1. A Load instruction, or an AtomicLoad intrinsic that would be lowered just like a Load instruction
2. Followed immediately by an instruction with a whitelisted kind that uses the Load dest variable as one of its operands
3. Where the whitelisted instruction ends the live range of the Load dest variable.

In such cases, the original two instructions are deleted and a new instruction is added that folds the load into the whitelisted instruction.

We also do some work to splice the liveness information (Inst::LiveRangesEnded and Inst::isLastUse()) into the new instruction, so that the target lowering pass might still take advantage. Currently this is used quite sparingly, but in the future we could use that along with operator commutativity to choose among different lowering sequences to reduce register pressure.

The whitelisted instruction kinds are chosen based primarily on whether the main operation's native instruction can use a memory operand - e.g., arithmetic (add/sub/imul/etc), compare (cmp/ucomiss), cast (movsx/movzx/etc). Notably, call and ret are not included because arg passing is done through simple assignments which normal lowering is sufficient for.

BUG= none
R=jvoung@chromium.org, mtrofin@chromium.org

Review URL: https://codereview.chromium.org/1169493002

Subzero: Improve/refactor folding loads into the next instruction.
8e6bf6e1 · Jim Stichnoth · bb9d11a5 · 8e6bf6e1 · 8e6bf6e1 · 8e6bf6e1
Commit 8e6bf6e1 authored Jun 03, 2015 by Jim Stichnoth
9 changed files
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
--- a/src/IceELFObjectWriter.cpp
+++ b/src/IceELFObjectWriter.cpp
@@ -383,8 +383,9 @@ void ELFObjectWriter::writeDataOfType(SectionType ST,
      for (VariableDeclaration::Initializer *Init : Var->getInitializers()) {
        switch (Init->getKind()) {
        case VariableDeclaration::Initializer::DataInitializerKind: {
-          const auto Data = llvm::cast<VariableDeclaration::DataInitializer>(
+          const auto Data =
-                                Init)->getContents();
+              llvm::cast<VariableDeclaration::DataInitializer>(Init)
+                  ->getContents();
          Section->appendData(Str, llvm::StringRef(Data.data(), Data.size()));
          break;
        }

--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -112,6 +112,44 @@ bool Inst::isLastUse(const Operand *TestSrc) const {
  return false;
 }
+// Given an instruction like:
+//   a = b + c + [x,y] + e
+// which was created from OrigInst:
+//   a = b + c + d + e
+// with SpliceAssn spliced in:
+//   d = [x,y]
+//
+// Reconstruct the LiveRangesEnded bitmask in this instruction by
+// combining the LiveRangesEnded values of OrigInst and SpliceAssn.
+// If operands d and [x,y] contain a different number of variables,
+// then the bitmask position for e may be different in OrigInst and
+// the current instruction, requiring extra shifts and masks in the
+// computation.  In the example above, OrigInst has variable e in bit
+// position 3, whereas the current instruction has e in bit position 4
+// because [x,y] consumes 2 bitmask slots while d only consumed 1.
+//
+// Additionally, set HasSideEffects if either OrigInst or SpliceAssn
+// have HasSideEffects set.
+void Inst::spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn) {
+  HasSideEffects |= OrigInst->HasSideEffects; // a side effect in either input carries over
+  HasSideEffects |= SpliceAssn->HasSideEffects;
+  // Find the bitmask index of SpliceAssn's dest within OrigInst.
+  Variable *SpliceDest = SpliceAssn->getDest();
+  SizeT Index = 0; // running total of getNumVars() over the operands scanned so far
+  for (SizeT I = 0; I < OrigInst->getSrcSize(); ++I) {
+    Operand *Src = OrigInst->getSrc(I);
+    if (Src == SpliceDest) {
+      LREndedBits LeftMask = OrigInst->LiveRangesEnded & ((1 << Index) - 1); // bits below Index, kept in place; NOTE(review): shifts an int literal, assumes Index < 31 - confirm
+      LREndedBits RightMask = OrigInst->LiveRangesEnded >> (Index + 1); // bits above Index, moved down to bit 0
+      LiveRangesEnded = LeftMask | (SpliceAssn->LiveRangesEnded << Index) | // SpliceAssn's bits start at Index
+                        (RightMask << (Index + getSrc(I)->getNumVars())); // upper bits land after this instruction's operand I
+      return; // only the first source equal to SpliceDest is spliced
+    }
+    Index += getSrc(I)->getNumVars(); // counts this instruction's operand I, not OrigInst's
+  }
+  llvm::report_fatal_error("Failed to find splice operand"); // no source of OrigInst equals SpliceDest
+}
 void Inst::livenessLightweight(Cfg *Func, LivenessBV &Live) {
  assert(!isDeleted());
  resetLastUses();

--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -102,6 +102,7 @@ public:
  }
  bool isLastUse(const Operand *Src) const;
+  void spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn);
  // Returns a list of out-edges corresponding to a terminator
  // instruction, which is the last instruction of the block.

--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -482,6 +482,7 @@ void TargetX8632::translateO2() {
    return;
  Func->dump("After x86 address mode opt");
+  doLoadOpt();
  Func->genCode();
  if (Func->hasError())
    return;
@@ -572,6 +573,126 @@ void TargetX8632::translateOm1() {
  }
 }
+namespace {
+// Converts a ConstantInteger32 operand into its constant value, or
+// MemoryOrderInvalid if the operand is not a ConstantInteger32.
+uint64_t getConstantMemoryOrder(Operand *Opnd) {
+  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd)) // null when Opnd is any other operand kind
+    return Integer->getValue();
+  return Intrinsics::MemoryOrderInvalid;
+}
+// Determines whether the dest of a Load instruction can be folded
+// into one of the src operands of a 2-operand instruction.  This is
+// true as long as the load dest matches exactly one of the binary
+// instruction's src operands.  Replaces Src0 or Src1 with LoadSrc if
+// the answer is true.
+bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
+                               Operand *&Src0, Operand *&Src1) { // Src0/Src1 are in-out: at most one is overwritten
+  if (Src0 == LoadDest && Src1 != LoadDest) { // load dest is only the first operand
+    Src0 = LoadSrc;
+    return true;
+  }
+  if (Src0 != LoadDest && Src1 == LoadDest) { // load dest is only the second operand
+    Src1 = LoadSrc;
+    return true;
+  }
+  return false; // load dest is neither operand, or is both; Src0 and Src1 are left unchanged
+}
+} // end of anonymous namespace
+void TargetX8632::doLoadOpt() { // walks every node and folds a load (or equivalent AtomicLoad) into the instruction right after it
+  for (CfgNode *Node : Func->getNodes()) {
+    Context.init(Node);
+    while (!Context.atEnd()) {
+      Variable *LoadDest = nullptr; // LoadDest and LoadSrc stay null unless CurInst qualifies below
+      Operand *LoadSrc = nullptr;
+      Inst *CurInst = Context.getCur();
+      Inst *Next = Context.getNextInst();
+      // Determine whether the current instruction is a Load
+      // instruction or equivalent.
+      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
+        // An InstLoad always qualifies.
+        LoadDest = Load->getDest();
+        const bool DoLegalize = false; // request the memory operand without formMemoryOperand's legalize() step
+        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
+                                    LoadDest->getType(), DoLegalize);
+      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
+        // An AtomicLoad intrinsic qualifies as long as it has a valid
+        // memory ordering, and can be implemented in a single
+        // instruction (i.e., not i64).
+        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
+        if (ID == Intrinsics::AtomicLoad &&
+            Intrin->getDest()->getType() != IceType_i64 &&
+            Intrinsics::isMemoryOrderValid(
+                ID, getConstantMemoryOrder(Intrin->getArg(1)))) { // arg 1 is checked as the memory order
+          LoadDest = Intrin->getDest();
+          const bool DoLegalize = false; // same as the InstLoad case: no legalize() step
+          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(), // arg 0 is used as the address
+                                      DoLegalize);
+        }
+      }
+      // A Load instruction can be folded into the following
+      // instruction only if the following instruction ends the Load's
+      // Dest variable's live range.
+      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
+        assert(LoadSrc);
+        Inst *NewInst = nullptr; // stays null when Next's kind is not handled or the operands do not match
+        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
+          Operand *Src0 = Arith->getSrc(0);
+          Operand *Src1 = Arith->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { // on success one of Src0/Src1 now holds LoadSrc
+            NewInst = InstArithmetic::create(Func, Arith->getOp(),
+                                             Arith->getDest(), Src0, Src1);
+          }
+        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
+          Operand *Src0 = Icmp->getSrc(0);
+          Operand *Src1 = Icmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
+                                       Icmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
+          Operand *Src0 = Fcmp->getSrc(0);
+          Operand *Src1 = Fcmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
+                                       Fcmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
+          Operand *Src0 = Select->getTrueOperand(); // only the true/false operands are fold candidates; the condition is passed through as-is
+          Operand *Src1 = Select->getFalseOperand();
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstSelect::create(Func, Select->getDest(),
+                                         Select->getCondition(), Src0, Src1);
+          }
+        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
+          // The load dest can always be folded into a Cast
+          // instruction.
+          Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+          if (Src0 == LoadDest) {
+            NewInst = InstCast::create(Func, Cast->getCastKind(),
+                                       Cast->getDest(), LoadSrc);
+          }
+        }
+        if (NewInst) {
+          CurInst->setDeleted(); // the load itself
+          Next->setDeleted(); // the instruction that consumed the load's dest
+          Context.insert(NewInst);
+          // Update NewInst->LiveRangesEnded so that target lowering
+          // may benefit.  Also update NewInst->HasSideEffects.
+          NewInst->spliceLivenessInfo(Next, CurInst); // Next is passed as OrigInst, CurInst as SpliceAssn
+        }
+      }
+      Context.advanceCur(); // both cursors advance on every iteration, whether or not a fold happened
+      Context.advanceNext();
+    }
+  }
+  Func->dump("After load optimization");
+}
 bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
    return Br->optimizeBranch(NextNode);
@@ -1170,6 +1291,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
  Variable *Dest = Inst->getDest();
  Operand *Src0 = legalize(Inst->getSrc(0));
  Operand *Src1 = legalize(Inst->getSrc(1));
+  if (Inst->isCommutative()) {
+    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
+      std::swap(Src0, Src1);
+  }
  if (Dest->getType() == IceType_i64) {
    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
@@ -2891,18 +3016,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
  }
 }
-namespace {
-// Converts a ConstantInteger32 operand into its constant value, or
-// MemoryOrderInvalid if the operand is not a ConstantInteger32.
-uint64_t getConstantMemoryOrder(Operand *Opnd) {
-  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
-    return Integer->getValue();
-  return Intrinsics::MemoryOrderInvalid;
-}
-} // end of anonymous namespace
 void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
  case Intrinsics::AtomicCmpxchg: {
@@ -3006,9 +3119,10 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
      Func->setError("Unexpected memory ordering for AtomicRMW");
      return;
    }
-    lowerAtomicRMW(Instr->getDest(),
+    lowerAtomicRMW(
-                   static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
+        Instr->getDest(),
-                                             Instr->getArg(0))->getValue()),
+        static_cast<uint32_t>(
+            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
        Instr->getArg(1), Instr->getArg(2));
    return;
  case Intrinsics::AtomicStore: {
@@ -3852,66 +3966,9 @@ void TargetX8632::lowerLoad(const InstLoad *Load) {
  // OperandX8632Mem operand.  Note that the address mode
  // optimization already creates an OperandX8632Mem operand, so it
  // doesn't need another level of transformation.
-  Type Ty = Load->getDest()->getType();
-  Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
-  // Fuse this load with a subsequent Arithmetic instruction in the
-  // following situations:
-  //   a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
-  //   a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
-  //
-  // Fuse this load with a subsequent Cast instruction:
-  //   a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
-  //
-  // TODO: Clean up and test thoroughly.
-  // (E.g., if there is an mfence-all make sure the load ends up on the
-  // same side of the fence).
-  //
-  // TODO: Why limit to Arithmetic instructions?  This could probably be
-  // applied to most any instruction type.  Look at all source operands
-  // in the following instruction, and if there is one instance of the
-  // load instruction's dest variable, and that instruction ends that
-  // variable's live range, then make the substitution.  Deal with
-  // commutativity optimization in the arithmetic instruction lowering.
-  //
-  // TODO(stichnot): Do load fusing as a separate pass.  Run it before
-  // the bool folding pass.  Modify Ice::Inst to allow src operands to
-  // be replaced, including updating Inst::LiveRangesEnded, to avoid
-  // having to manually mostly clone each instruction type.
-  Inst *NextInst = Context.getNextInst();
  Variable *DestLoad = Load->getDest();
-  if (NextInst && NextInst->isLastUse(DestLoad)) {
+  Type Ty = DestLoad->getType();
-    if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
+  Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
-      InstArithmetic *NewArith = nullptr;
-      Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
-      Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
-      if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
-        NewArith = InstArithmetic::create(
-            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
-      } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
-                 DestLoad != Src1Arith) {
-        NewArith = InstArithmetic::create(
-            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
-      }
-      if (NewArith) {
-        Arith->setDeleted();
-        Context.advanceNext();
-        lowerArithmetic(NewArith);
-        return;
-      }
-    } else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
-      Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
-      if (Src0Cast == DestLoad) {
-        InstCast *NewCast =
-            InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
-        Cast->setDeleted();
-        Context.advanceNext();
-        lowerCast(NewCast);
-        return;
-      }
-    }
-  }
  InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
  lowerAssign(Assign);
 }
@@ -4639,7 +4696,8 @@ Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
 }
-OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) {
+OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty,
+                                                bool DoLegalize) {
  OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
  // It may be the case that address mode optimization already creates
  // an OperandX8632Mem, so in that case it wouldn't need another level
@@ -4656,7 +4714,7 @@ OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) {
    }
    Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
  }
-  return llvm::cast<OperandX8632Mem>(legalize(Mem));
+  return llvm::cast<OperandX8632Mem>(DoLegalize ? legalize(Mem) : Mem);
 }
 Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -101,6 +101,7 @@ public:
  void translateOm1() override;
  void translateO2() override;
+  void doLoadOpt();
  bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
  SizeT getNumRegisters() const override { return RegX8632::Reg_NUM; }
@@ -229,7 +230,8 @@ protected:
  // Turn a pointer operand into a memory operand that can be
  // used by a real load/store operation. Legalizes the operand as well.
  // This is a nop if the operand is already a legal memory operand.
-  OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty);
+  OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty,
+                                     bool DoLegalize = true);
  Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
  static Type stackSlotType();

--- a/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll
@@ -14,7 +14,7 @@ declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
 @g32_c = internal global [4 x i8] zeroinitializer, align 4
 @g32_d = internal global [4 x i8] zeroinitializer, align 4
-define i32 @test_fused_load_add_a() {
+define i32 @test_fused_load_sub_a() {
 entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
@@ -22,39 +22,39 @@ entry:
  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
-  %l_a2 = add i32 %l_a, 1
+  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
-  %l_b2 = add i32 %l_b, 1
+  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1
  %p_c = bitcast [4 x i8]* @g32_c to i32*
  %l_c = load i32, i32* %p_c, align 1
-  %l_c2 = add i32 %l_c, 1
+  %l_c2 = sub i32 1, %l_c
  call void @llvm.nacl.atomic.fence.all()
  store i32 %l_c2, i32* %p_c, align 1
  ret i32 %l_c2
 }
-; CHECK-LABEL: test_fused_load_add_a
+; CHECK-LABEL: test_fused_load_sub_a
 ;    alloca store
 ; CHECK: mov {{.*}},esp
 ; CHECK: mov DWORD PTR {{.*}},0x3e7
 ;    atomic store (w/ its own mfence)
-; The load + add are optimized into one everywhere.
+; The load + sub are optimized into one everywhere.
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
 ; CHECK: mov DWORD PTR
 ; CHECK: mfence
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
 ; CHECK: mov DWORD PTR
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
 ; CHECK: mfence
 ; CHECK: mov DWORD PTR
 ; Test with the fence moved up a bit.
-define i32 @test_fused_load_add_b() {
+define i32 @test_fused_load_sub_b() {
 entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
@@ -62,40 +62,40 @@ entry:
  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
-  %l_a2 = add i32 %l_a, 1
+  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
-  %l_b2 = add i32 %l_b, 1
+  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1
  %p_c = bitcast [4 x i8]* @g32_c to i32*
  call void @llvm.nacl.atomic.fence.all()
  %l_c = load i32, i32* %p_c, align 1
-  %l_c2 = add i32 %l_c, 1
+  %l_c2 = sub i32 1, %l_c
  store i32 %l_c2, i32* %p_c, align 1
  ret i32 %l_c2
 }
-; CHECK-LABEL: test_fused_load_add_b
+; CHECK-LABEL: test_fused_load_sub_b
 ;    alloca store
 ; CHECK: mov {{.*}},esp
 ; CHECK: mov DWORD PTR {{.*}},0x3e7
 ;    atomic store (w/ its own mfence)
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
 ; CHECK: mov DWORD PTR
 ; CHECK: mfence
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
 ; CHECK: mov DWORD PTR
 ; CHECK: mfence
-; Load + add can still be optimized into one instruction
+; Load + sub can still be optimized into one instruction
 ; because it is not separated by a fence.
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
 ; CHECK: mov DWORD PTR
-; Test with the fence splitting a load/add.
+; Test with the fence splitting a load/sub.
-define i32 @test_fused_load_add_c() {
+define i32 @test_fused_load_sub_c() {
 entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
@@ -103,38 +103,39 @@ entry:
  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
-  %l_a2 = add i32 %l_a, 1
+  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  call void @llvm.nacl.atomic.fence.all()
-  %l_b2 = add i32 %l_b, 1
+  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1
  %p_c = bitcast [4 x i8]* @g32_c to i32*
  %l_c = load i32, i32* %p_c, align 1
-  %l_c2 = add i32 %l_c, 1
+  %l_c2 = sub i32 1, %l_c
  store i32 %l_c2, i32* %p_c, align 1
  ret i32 %l_c2
 }
-; CHECK-LABEL: test_fused_load_add_c
+; CHECK-LABEL: test_fused_load_sub_c
 ;    alloca store
 ; CHECK: mov {{.*}},esp
 ; CHECK: mov DWORD PTR {{.*}},0x3e7
 ;    atomic store (w/ its own mfence)
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
 ; CHECK: mov DWORD PTR
 ; CHECK: mfence
-; This load + add are no longer optimized into one,
+; This load + sub are no longer optimized into one,
 ; though perhaps it should be legal as long as
 ; the load stays on the same side of the fence.
 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b
 ; CHECK: mfence
-; CHECK: add {{.*}},0x1
+; CHECK: mov {{.*}},0x1
+; CHECK: sub
 ; CHECK: mov DWORD PTR
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
 ; CHECK: mov DWORD PTR

--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -95,17 +95,17 @@ entry:
 next:
  %ptr = inttoptr i32 %iptr to i32*
  %r = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
-  %r2 = add i32 %r, 32
+  %r2 = sub i32 32, %r
  ret i32 %r2
 }
 ; CHECK-LABEL: test_atomic_load_32_with_arith
 ; CHECK: mov {{.*}},DWORD
 ; The next instruction may be a separate load or folded into an add.
 ;
-; In O2 mode, we know that the load and add are going to be fused.
+; In O2 mode, we know that the load and sub are going to be fused.
 ; O2-LABEL: test_atomic_load_32_with_arith
 ; O2: mov {{.*}},DWORD
-; O2: add {{.*}},DWORD
+; O2: sub {{.*}},DWORD
 define i32 @test_atomic_load_32_ignored(i32 %iptr) {
 entry: