Commit 5cd240df by Jan Voung

Add atomic load/store, fetch_add, fence, and is-lock-free lowering.

Loads/stores with type i8, i16, and i32 are converted to plain load/store instructions and lowered with the plain lowerLoad/lowerStore. Atomic stores are followed by an mfence for sequential consistency.

For 64-bit types, use movq to do 64-bit memory loads/stores (vs the usual load/store, which is broken into separate 32-bit loads/stores). This means bitcasting the i64 -> f64 first (which splits the value to be stored into two 32-bit ops), then storing it in a single op. For load, load into f64 and then bitcast back to i64 (so the split happens after the atomic load). This follows what GCC does for the C++11 std::atomic<uint64_t> load/store methods (it uses movq when -mfpmath=sse). This introduces some redundancy between movq and movsd, but the convention seems to be to use movq when working with integer quantities; otherwise movsd would work too. The difference seems to be whether or not the XMM register's upper 64 bits are filled with zero. Zero-extending could help avoid partial register stalls.

Handle fetch_add up to i32. TODO: add i64 via a cmpxchg loop.

TODO: add some runnable crosstests to make sure that this doesn't do funny things to integer bit patterns that happen to look like signaling NaNs and quiet NaNs. However, the system clang would not know how to handle "llvm.nacl.*" if we choose to target that level directly via .ll files. Alternatively, (a) we use the old-school __sync methods (__sync_fetch_and_add with 0 to load), or (b) we require the buildbot's clang/gcc to support C++11...

BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/342763004
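For reference, the i64 paths described above reduce to roughly the following shape in the lowering code (a sketch using the helpers added in this CL plus the existing makeReg/lowerCast machinery; legalization and error handling are elided):

  // Atomic 64-bit store: bitcast i64 -> f64 (this is where the value
  // splits into two 32-bit ops), store the XMM register with a single
  // movq, then fence for sequential consistency.
  Variable *T = makeReg(IceType_f64);
  lowerCast(InstCast::create(Func, InstCast::Bitcast, T, Value));
  _storeq(T, FormMemoryOperand(Ptr, IceType_f64));
  _mfence();

  // Atomic 64-bit load: movq into an XMM register, then bitcast
  // f64 -> i64 (the split happens after the atomic load).
  Variable *T2 = makeReg(IceType_f64);
  _movq(T2, FormMemoryOperand(Ptr, IceType_f64));
  lowerCast(InstCast::create(Func, InstCast::Bitcast, Dest, T2));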
parent 1ee34165
......@@ -166,6 +166,11 @@ InstX8632Test::InstX8632Test(Cfg *Func, Operand *Src1, Operand *Src2)
addSource(Src2);
}
InstX8632Mfence::InstX8632Mfence(Cfg *Func)
: InstX8632(Func, InstX8632::Mfence, 0, NULL) {
HasSideEffects = true;
}
InstX8632Store::InstX8632Store(Cfg *Func, Operand *Value, OperandX8632 *Mem)
: InstX8632(Func, InstX8632::Store, 2, NULL) {
addSource(Value);
......@@ -177,6 +182,17 @@ InstX8632Mov::InstX8632Mov(Cfg *Func, Variable *Dest, Operand *Source)
addSource(Source);
}
InstX8632StoreQ::InstX8632StoreQ(Cfg *Func, Operand *Value, OperandX8632 *Mem)
: InstX8632(Func, InstX8632::StoreQ, 2, NULL) {
addSource(Value);
addSource(Mem);
}
InstX8632Movq::InstX8632Movq(Cfg *Func, Variable *Dest, Operand *Source)
: InstX8632(Func, InstX8632::Movq, 1, Dest) {
addSource(Source);
}
InstX8632Movsx::InstX8632Movsx(Cfg *Func, Variable *Dest, Operand *Source)
: InstX8632(Func, InstX8632::Movsx, 1, Dest) {
addSource(Source);
......@@ -221,12 +237,34 @@ bool InstX8632Mov::isRedundantAssign() const {
return false;
}
bool InstX8632Movq::isRedundantAssign() const {
Variable *Src = llvm::dyn_cast<Variable>(getSrc(0));
if (Src == NULL)
return false;
if (getDest()->hasReg() && getDest()->getRegNum() == Src->getRegNum()) {
return true;
}
if (!getDest()->hasReg() && !Src->hasReg() &&
getDest()->getStackOffset() == Src->getStackOffset())
return true;
return false;
}
InstX8632Ret::InstX8632Ret(Cfg *Func, Variable *Source)
: InstX8632(Func, InstX8632::Ret, Source ? 1 : 0, NULL) {
if (Source)
addSource(Source);
}
InstX8632Xadd::InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source,
bool Locked)
: InstX8632(Func, InstX8632::Xadd, 2, llvm::dyn_cast<Variable>(Dest)),
Locked(Locked) {
HasSideEffects = Locked;
addSource(Dest);
addSource(Source);
}
// ======================== Dump routines ======================== //
void InstX8632::dump(const Cfg *Func) const {
......@@ -564,6 +602,17 @@ void InstX8632Test::dump(const Cfg *Func) const {
dumpSources(Func);
}
void InstX8632Mfence::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 0);
Str << "\tmfence\n";
}
void InstX8632Mfence::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
Str << "mfence\n";
}
void InstX8632Store::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2);
......@@ -583,6 +632,26 @@ void InstX8632Store::dump(const Cfg *Func) const {
getSrc(0)->dump(Func);
}
void InstX8632StoreQ::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2);
assert(getSrc(1)->getType() == IceType_i64 ||
getSrc(1)->getType() == IceType_f64);
Str << "\tmovq\t";
getSrc(1)->emit(Func);
Str << ", ";
getSrc(0)->emit(Func);
Str << "\n";
}
void InstX8632StoreQ::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
Str << "storeq." << getSrc(0)->getType() << " ";
getSrc(1)->dump(Func);
Str << ", ";
getSrc(0)->dump(Func);
}
void InstX8632Mov::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
......@@ -611,6 +680,26 @@ void InstX8632Mov::dump(const Cfg *Func) const {
dumpSources(Func);
}
void InstX8632Movq::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
assert(getDest()->getType() == IceType_i64 ||
getDest()->getType() == IceType_f64);
Str << "\tmovq\t";
getDest()->emit(Func);
Str << ", ";
getSrc(0)->emit(Func);
Str << "\n";
}
void InstX8632Movq::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
Str << "movq." << getDest()->getType() << " ";
dumpDest(Func);
Str << ", ";
dumpSources(Func);
}
void InstX8632Movsx::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
......@@ -773,6 +862,29 @@ void InstX8632Ret::dump(const Cfg *Func) const {
dumpSources(Func);
}
void InstX8632Xadd::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
if (Locked) {
Str << "\tlock xadd ";
} else {
Str << "\txadd\t";
}
getSrc(0)->emit(Func);
Str << ", ";
getSrc(1)->emit(Func);
Str << "\n";
}
void InstX8632Xadd::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
if (Locked) {
Str << "lock ";
}
Type Ty = getSrc(0)->getType();
Str << "xadd." << Ty << " ";
dumpSources(Func);
}
void OperandX8632::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
Str << "<OperandX8632>";
......
......@@ -151,7 +151,9 @@ public:
Imul,
Label,
Load,
Mfence,
Mov,
Movq,
Movsx,
Movzx,
Mul,
......@@ -167,11 +169,13 @@ public:
Shr,
Shrd,
Store,
StoreQ,
Sub,
Subss,
Test,
Ucomiss,
UD2,
Xadd,
Xor
};
static const char *getWidthString(Type Ty);
......@@ -578,6 +582,23 @@ private:
virtual ~InstX8632Test() {}
};
// Mfence instruction.
class InstX8632Mfence : public InstX8632 {
public:
static InstX8632Mfence *create(Cfg *Func) {
return new (Func->allocate<InstX8632Mfence>()) InstX8632Mfence(Func);
}
virtual void emit(const Cfg *Func) const;
virtual void dump(const Cfg *Func) const;
static bool classof(const Inst *Inst) { return isClassof(Inst, Mfence); }
private:
InstX8632Mfence(Cfg *Func);
InstX8632Mfence(const InstX8632Mfence &) LLVM_DELETED_FUNCTION;
InstX8632Mfence &operator=(const InstX8632Mfence &) LLVM_DELETED_FUNCTION;
virtual ~InstX8632Mfence() {}
};
// This is essentially a "mov" instruction with an OperandX8632Mem
// operand instead of Variable as the destination. It's important
// for liveness that there is no Dest operand.
......@@ -617,6 +638,45 @@ private:
virtual ~InstX8632Mov() {}
};
// This is essentially a "movq" instruction with an OperandX8632Mem
// operand instead of Variable as the destination. It's important
// for liveness that there is no Dest operand.
class InstX8632StoreQ : public InstX8632 {
public:
static InstX8632StoreQ *create(Cfg *Func, Operand *Value, OperandX8632 *Mem) {
return new (Func->allocate<InstX8632StoreQ>())
InstX8632StoreQ(Func, Value, Mem);
}
virtual void emit(const Cfg *Func) const;
virtual void dump(const Cfg *Func) const;
static bool classof(const Inst *Inst) { return isClassof(Inst, StoreQ); }
private:
InstX8632StoreQ(Cfg *Func, Operand *Value, OperandX8632 *Mem);
InstX8632StoreQ(const InstX8632StoreQ &) LLVM_DELETED_FUNCTION;
InstX8632StoreQ &operator=(const InstX8632StoreQ &) LLVM_DELETED_FUNCTION;
virtual ~InstX8632StoreQ() {}
};
// Movq - copy between XMM registers, or mem64 and XMM registers.
class InstX8632Movq : public InstX8632 {
public:
static InstX8632Movq *create(Cfg *Func, Variable *Dest, Operand *Source) {
return new (Func->allocate<InstX8632Movq>())
InstX8632Movq(Func, Dest, Source);
}
virtual bool isRedundantAssign() const;
virtual void emit(const Cfg *Func) const;
virtual void dump(const Cfg *Func) const;
static bool classof(const Inst *Inst) { return isClassof(Inst, Movq); }
private:
InstX8632Movq(Cfg *Func, Variable *Dest, Operand *Source);
InstX8632Movq(const InstX8632Movq &) LLVM_DELETED_FUNCTION;
InstX8632Movq &operator=(const InstX8632Movq &) LLVM_DELETED_FUNCTION;
virtual ~InstX8632Movq() {}
};
// Movsx - copy from a narrower integer type to a wider integer
// type, with sign extension.
class InstX8632Movsx : public InstX8632 {
......@@ -744,6 +804,33 @@ private:
virtual ~InstX8632Ret() {}
};
// Exchanging Add instruction. Exchanges the first operand (destination
// operand) with the second operand (source operand), then loads the sum
// of the two values into the destination operand. The destination may be
// a register or memory, while the source must be a register.
//
// Both the dest and source are updated. The caller should then insert a
// FakeDef to reflect the second update.
class InstX8632Xadd : public InstX8632 {
public:
static InstX8632Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
bool Locked) {
return new (Func->allocate<InstX8632Xadd>())
InstX8632Xadd(Func, Dest, Source, Locked);
}
virtual void emit(const Cfg *Func) const;
virtual void dump(const Cfg *Func) const;
static bool classof(const Inst *Inst) { return isClassof(Inst, Xadd); }
private:
bool Locked;
InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
InstX8632Xadd(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
InstX8632Xadd &operator=(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
virtual ~InstX8632Xadd() {}
};
} // end of namespace Ice
#endif // SUBZERO_SRC_ICEINSTX8632_H
......@@ -82,7 +82,7 @@ const struct IceIntrinsicsEntry_ {
{ \
{ \
{ Intrinsics::AtomicStore, true } \
, { IceType_void, Overload, IceType_i32, IceType_i32 }, 5 \
, { IceType_void, Overload, IceType_i32, IceType_i32 }, 4 \
} \
, "nacl.atomic.store." NameSuffix \
}
......@@ -199,4 +199,9 @@ Intrinsics::find(const IceString &Name) const {
return &it->second;
}
bool Intrinsics::VerifyMemoryOrder(uint64_t Order) {
// Only one memory ordering is allowed for atomics right now.
return Order == Intrinsics::MemoryOrderSequentiallyConsistent;
}
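// Usage sketch (not part of this diff): lowering code is expected to
// validate the constant order argument of each atomic intrinsic and set
// a translation error when it is rejected, producing the messages the
// tests below check for. The accessor names here (ConstantInteger::getValue,
// Cfg::setError) are assumptions based on the surrounding codebase:
//
//   if (!Intrinsics::VerifyMemoryOrder(
//           llvm::cast<ConstantInteger>(Instr->getArg(1))->getValue())) {
//     Func->setError("Unexpected memory ordering for AtomicLoad");
//     return;
//   }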
} // end of namespace Ice
......@@ -54,6 +54,39 @@ public:
Trap
};
/// Operations that can be represented by the AtomicRMW
/// intrinsic.
///
/// Do not reorder these values: their order offers forward
/// compatibility of bitcode targeted to PNaCl.
enum AtomicRMWOperation {
AtomicInvalid = 0, // Invalid, keep first.
AtomicAdd,
AtomicSub,
AtomicOr,
AtomicAnd,
AtomicXor,
AtomicExchange,
AtomicNum // Invalid, keep last.
};
/// Memory orderings supported by PNaCl IR.
///
/// Do not reorder these values: their order offers forward
/// compatibility of bitcode targeted to PNaCl.
enum MemoryOrder {
MemoryOrderInvalid = 0, // Invalid, keep first.
MemoryOrderRelaxed,
MemoryOrderConsume,
MemoryOrderAcquire,
MemoryOrderRelease,
MemoryOrderAcquireRelease,
MemoryOrderSequentiallyConsistent,
MemoryOrderNum // Invalid, keep last.
};
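// Note: with the ordering above, MemoryOrderSequentiallyConsistent
// encodes as 6, which is the constant the tests below pass as the
// memory-order argument of the llvm.nacl.atomic.* intrinsics.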
static bool VerifyMemoryOrder(uint64_t Order);
// Basic attributes related to each intrinsic, that are relevant to
// code generation. We will want to have more attributes (e.g., Setjmp
// returns twice and which affects stack coloring) once the lowering
......
......@@ -94,6 +94,9 @@ protected:
virtual void doAddressOptLoad();
virtual void doAddressOptStore();
void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
Operand *Val);
// Operand legalization helpers. To deal with address mode
// constraints, the helpers will create a new Operand and emit
// instructions that guarantee that the Operand kind is one of those
......@@ -114,6 +117,10 @@ protected:
int32_t RegNum = Variable::NoRegister);
Variable *legalizeToVar(Operand *From, bool AllowOverlap = false,
int32_t RegNum = Variable::NoRegister);
// Turn a pointer operand into a memory operand that can be
// used by a real load/store operation. Legalizes the operand as well.
// This is a nop if the operand is already a legal memory operand.
OperandX8632Mem *FormMemoryOperand(Operand *Ptr, Type Ty);
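// A possible shape for the definition (a sketch only; the exact
// OperandX8632Mem::create arguments are assumptions):
//
//   OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Ptr);
//   if (Mem == NULL) {
//     Variable *Base = llvm::dyn_cast<Variable>(Ptr);
//     Constant *Offset = llvm::dyn_cast<Constant>(Ptr);
//     assert(Base || Offset);
//     Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
//   }
//   return llvm::cast<OperandX8632Mem>(legalize(Mem));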
Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
InstCall *makeHelperCall(const IceString &Name, Variable *Dest,
......@@ -180,6 +187,7 @@ protected:
void _imul(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Imul::create(Func, Dest, Src0));
}
void _mfence() { Context.insert(InstX8632Mfence::create(Func)); }
// If Dest=NULL is passed in, then a new variable is created, marked
// as infinite register allocation weight, and returned through the
// in/out Dest argument.
......@@ -191,6 +199,9 @@ protected:
Context.insert(InstX8632Mov::create(Func, Dest, Src0));
}
}
void _movq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Movq::create(Func, Dest, Src0));
}
void _movsx(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Movsx::create(Func, Dest, Src0));
}
......@@ -236,6 +247,9 @@ protected:
void _store(Operand *Value, OperandX8632 *Mem) {
Context.insert(InstX8632Store::create(Func, Value, Mem));
}
void _storeq(Operand *Value, OperandX8632 *Mem) {
Context.insert(InstX8632StoreQ::create(Func, Value, Mem));
}
void _sub(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Sub::create(Func, Dest, Src0));
}
......@@ -249,6 +263,12 @@ protected:
Context.insert(InstX8632Ucomiss::create(Func, Src0, Src1));
}
void _ud2() { Context.insert(InstX8632UD2::create(Func)); }
void _xadd(Operand *Dest, Variable *Src, bool Locked) {
Context.insert(InstX8632Xadd::create(Func, Dest, Src, Locked));
// The xadd exchanges Dest and Src (modifying Src).
// Model that update with a FakeDef.
Context.insert(InstFakeDef::create(Func, Src));
}
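// Sketch of how lowerAtomicRMW can use _xadd for the fetch_add case
// (up to i32; i64 still needs the cmpxchg loop noted in the commit
// message). Dest/Ptr/Val match lowerAtomicRMW's parameters:
//
//   OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
//   const bool Locked = true;
//   Variable *T = NULL;
//   _mov(T, Val);           // Copy the addend into a register.
//   _xadd(Addr, T, Locked); // lock xadd [Addr], T: T gets the old value.
//   _mov(Dest, T);          // The old value is the fetch_add result.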
void _xor(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Xor::create(Func, Dest, Src0));
}
......
......@@ -599,8 +599,6 @@ private:
}
}
if (Call->getNumArgs() + 1 != I->NumTypes) {
std::cerr << "Call->getNumArgs() " << (int)Call->getNumArgs()
<< " I->NumTypes " << (int)I->NumTypes << "\n";
report_fatal_error("Mismatched # of args.");
}
for (size_t i = 1; i < I->NumTypes; ++i) {
......
; Test that some errors trigger when the usage of NaCl atomic
; intrinsics does not match the required ABI.
; RUN: not %llvm2ice --verbose none %s 2>&1 | FileCheck %s
declare i8 @llvm.nacl.atomic.load.i8(i8*, i32)
declare i16 @llvm.nacl.atomic.load.i16(i16*, i32)
declare i64 @llvm.nacl.atomic.load.i64(i64*, i32)
declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
declare void @llvm.nacl.atomic.store.i64(i64, i64*, i32)
declare i8 @llvm.nacl.atomic.rmw.i8(i32, i8*, i8, i32)
declare i16 @llvm.nacl.atomic.rmw.i16(i32, i16*, i16, i32)
declare i32 @llvm.nacl.atomic.rmw.i32(i32, i32*, i32, i32)
declare i64 @llvm.nacl.atomic.rmw.i64(i32, i64*, i64, i32)
declare i32 @llvm.nacl.atomic.cmpxchg.i32(i32*, i32, i32, i32, i32)
declare i64 @llvm.nacl.atomic.cmpxchg.i64(i64*, i64, i64, i32, i32)
declare void @llvm.nacl.atomic.fence(i32)
declare i1 @llvm.nacl.atomic.is.lock.free(i32, i8*)
;;; Load
;;; Check unexpected memory order parameter (only sequential
;;; consistency == 6 is currently allowed).
define i32 @error_atomic_load_8(i32 %iptr) {
entry:
%ptr = inttoptr i32 %iptr to i8*
%i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 0)
%r = zext i8 %i to i32
ret i32 %r
}
; CHECK: Unexpected memory ordering for AtomicLoad
define i32 @error_atomic_load_16(i32 %iptr) {
entry:
%ptr = inttoptr i32 %iptr to i16*
%i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 1)
%r = zext i16 %i to i32
ret i32 %r
}
; CHECK: Unexpected memory ordering for AtomicLoad
define i64 @error_atomic_load_64(i32 %iptr) {
entry:
%ptr = inttoptr i32 %iptr to i64*
%r = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr, i32 2)
ret i64 %r
}
; CHECK: Unexpected memory ordering for AtomicLoad
;;; Store
define void @error_atomic_store_32(i32 %iptr, i32 %v) {
entry:
%ptr = inttoptr i32 %iptr to i32*
call void @llvm.nacl.atomic.store.i32(i32 %v, i32* %ptr, i32 2)
ret void
}
; CHECK: Unexpected memory ordering for AtomicStore
define void @error_atomic_store_64(i32 %iptr, i64 %v) {
entry:
%ptr = inttoptr i32 %iptr to i64*
call void @llvm.nacl.atomic.store.i64(i64 %v, i64* %ptr, i32 3)
ret void
}
; CHECK: Unexpected memory ordering for AtomicStore
define void @error_atomic_store_64_const(i32 %iptr) {
entry:
%ptr = inttoptr i32 %iptr to i64*
call void @llvm.nacl.atomic.store.i64(i64 12345678901234, i64* %ptr, i32 4)
ret void
}
; CHECK: Unexpected memory ordering for AtomicStore
;;; RMW
;;; Test atomic memory order and operation.
define i32 @error_atomic_rmw_add_8(i32 %iptr, i32 %v) {
entry:
%trunc = trunc i32 %v to i8
%ptr = inttoptr i32 %iptr to i8*
%a = call i8 @llvm.nacl.atomic.rmw.i8(i32 1, i8* %ptr, i8 %trunc, i32 5)
%a_ext = zext i8 %a to i32
ret i32 %a_ext
}
; CHECK: Unexpected memory ordering for AtomicRMW
define i64 @error_atomic_rmw_add_64(i32 %iptr, i64 %v) {
entry:
%ptr = inttoptr i32 %iptr to i64*
%a = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 4)
ret i64 %a
}
; CHECK: Unexpected memory ordering for AtomicRMW
define i32 @error_atomic_rmw_add_16(i32 %iptr, i32 %v) {
entry:
%trunc = trunc i32 %v to i16
%ptr = inttoptr i32 %iptr to i16*
%a = call i16 @llvm.nacl.atomic.rmw.i16(i32 0, i16* %ptr, i16 %trunc, i32 6)
%a_ext = zext i16 %a to i32
ret i32 %a_ext
}
; CHECK: Unknown AtomicRMW operation
define i32 @error_atomic_rmw_add_32(i32 %iptr, i32 %v) {
entry:
%ptr = inttoptr i32 %iptr to i32*
%a = call i32 @llvm.nacl.atomic.rmw.i32(i32 7, i32* %ptr, i32 %v, i32 6)
ret i32 %a
}
; CHECK: Unknown AtomicRMW operation
define i32 @error_atomic_rmw_add_32_max(i32 %iptr, i32 %v) {
entry:
%ptr = inttoptr i32 %iptr to i32*
%a = call i32 @llvm.nacl.atomic.rmw.i32(i32 4294967295, i32* %ptr, i32 %v, i32 6)
ret i32 %a
}
; CHECK: Unknown AtomicRMW operation
;;; Cmpxchg
define i32 @error_atomic_cmpxchg_32_success(i32 %iptr, i32 %expected, i32 %desired) {
entry:
%ptr = inttoptr i32 %iptr to i32*
%old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
i32 %desired, i32 0, i32 6)
ret i32 %old
}
; CHECK: Unexpected memory ordering (success) for AtomicCmpxchg
define i32 @error_atomic_cmpxchg_32_failure(i32 %iptr, i32 %expected, i32 %desired) {
entry:
%ptr = inttoptr i32 %iptr to i32*
%old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
i32 %desired, i32 6, i32 0)
ret i32 %old
}
; CHECK: Unexpected memory ordering (failure) for AtomicCmpxchg
define i64 @error_atomic_cmpxchg_64_failure(i32 %iptr, i64 %expected, i64 %desired) {
entry:
%ptr = inttoptr i32 %iptr to i64*
%old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
i64 %desired, i32 6, i32 3)
ret i64 %old
}
; CHECK: Unexpected memory ordering (failure) for AtomicCmpxchg
;;; Fence and is-lock-free.
define void @error_atomic_fence() {
entry:
call void @llvm.nacl.atomic.fence(i32 1)
ret void
}
; CHECK: Unexpected memory ordering for AtomicFence
define i32 @error_atomic_is_lock_free_var(i32 %iptr, i32 %bs) {
entry:
%ptr = inttoptr i32 %iptr to i8*
%i = call i1 @llvm.nacl.atomic.is.lock.free(i32 %bs, i8* %ptr)
%r = zext i1 %i to i32
ret i32 %r
}
; CHECK: AtomicIsLockFree byte size should be compile-time const
; Test that loads/stores don't move across a nacl.atomic.fence.all.
; This should apply to both atomic and non-atomic loads/stores
; (unlike the non-"all" variety of nacl.atomic.fence, which only
; applies to atomic load/stores).
;
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
declare void @llvm.nacl.atomic.fence.all()
declare i32 @llvm.nacl.atomic.load.i32(i32*, i32)
declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
@g32_a = internal global [4 x i8] zeroinitializer, align 4
@g32_b = internal global [4 x i8] zeroinitializer, align 4
@g32_c = internal global [4 x i8] zeroinitializer, align 4
@g32_d = internal global [4 x i8] c"\02\00\00\00", align 4
define i32 @test_fused_load_add_a() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
store i32 999, i32* %p_alloca_bc, align 1
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32* %p_b
%l_b2 = add i32 %l_b, 1
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
%l_c = load i32* %p_c
%l_c2 = add i32 %l_c, 1
call void @llvm.nacl.atomic.fence.all()
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_add_a
; alloca store
; CHECK: mov {{.*}}, esp
; CHECK: mov dword ptr {{.*}}, 999
; atomic store (w/ its own mfence)
; CHECK: mov {{.*}}, g32_a
; The load and add are optimized into a single instruction everywhere.
; CHECK: add {{.*}}, dword ptr
; CHECK: mov dword ptr
; CHECK: mfence
; CHECK: mov {{.*}}, g32_b
; CHECK: add {{.*}}, dword ptr
; CHECK: mov dword ptr
; CHECK: mov {{.*}}, g32_c
; CHECK: add {{.*}}, dword ptr
; CHECK: mfence
; CHECK: mov dword ptr
; Test with the fence moved up a bit.
define i32 @test_fused_load_add_b() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
store i32 999, i32* %p_alloca_bc, align 1
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32* %p_b
%l_b2 = add i32 %l_b, 1
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
call void @llvm.nacl.atomic.fence.all()
%l_c = load i32* %p_c
%l_c2 = add i32 %l_c, 1
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_add_b
; alloca store
; CHECK: mov {{.*}}, esp
; CHECK: mov dword ptr {{.*}}, 999
; atomic store (w/ its own mfence)
; CHECK: mov {{.*}}, g32_a
; CHECK: add {{.*}}, dword ptr
; CHECK: mov dword ptr
; CHECK: mfence
; CHECK: mov {{.*}}, g32_b
; CHECK: add {{.*}}, dword ptr
; CHECK: mov dword ptr
; CHECK: mov {{.*}}, g32_c
; CHECK: mfence
; Load + add can still be optimized into one instruction
; because it is not separated by a fence.
; CHECK: add {{.*}}, dword ptr
; CHECK: mov dword ptr
; Test with the fence splitting a load/add.
define i32 @test_fused_load_add_c() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
store i32 999, i32* %p_alloca_bc, align 1
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32* %p_b
call void @llvm.nacl.atomic.fence.all()
%l_b2 = add i32 %l_b, 1
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
%l_c = load i32* %p_c
%l_c2 = add i32 %l_c, 1
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_add_c
; alloca store
; CHECK: mov {{.*}}, esp
; CHECK: mov dword ptr {{.*}}, 999
; atomic store (w/ its own mfence)
; CHECK: mov {{.*}}, g32_a
; CHECK: add {{.*}}, dword ptr
; CHECK: mov dword ptr
; CHECK: mfence
; CHECK: mov {{.*}}, g32_b
; This load and add are no longer optimized into a single instruction,
; though perhaps that should still be legal as long as the load stays
; on the same side of the fence.
; CHECK: mov {{.*}}, dword ptr
; CHECK: mfence
; CHECK: add {{.*}}, 1
; CHECK: mov dword ptr
; CHECK: mov {{.*}}, g32_c
; CHECK: add {{.*}}, dword ptr
; CHECK: mov dword ptr
; Test where a bunch of i8 loads could have been fused into one
; i32 load, but a fence blocks that.
define i32 @could_have_fused_loads() {
entry:
%ptr1 = bitcast [4 x i8]* @g32_d to i8*
%b1 = load i8* %ptr1
%int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32
%int_ptr_bump2 = add i32 %int_ptr2, 1
%ptr2 = inttoptr i32 %int_ptr_bump2 to i8*
%b2 = load i8* %ptr2
%int_ptr_bump3 = add i32 %int_ptr2, 2
%ptr3 = inttoptr i32 %int_ptr_bump3 to i8*
%b3 = load i8* %ptr3
call void @llvm.nacl.atomic.fence.all()
%int_ptr_bump4 = add i32 %int_ptr2, 3
%ptr4 = inttoptr i32 %int_ptr_bump4 to i8*
%b4 = load i8* %ptr4
%b1.ext = zext i8 %b1 to i32
%b2.ext = zext i8 %b2 to i32
%b2.shift = shl i32 %b2.ext, 8
%b12 = or i32 %b1.ext, %b2.shift
%b3.ext = zext i8 %b3 to i32
%b3.shift = shl i32 %b3.ext, 16
%b123 = or i32 %b12, %b3.shift
%b4.ext = zext i8 %b4 to i32
%b4.shift = shl i32 %b4.ext, 24
%b1234 = or i32 %b123, %b4.shift
ret i32 %b1234
}
; CHECK-LABEL: could_have_fused_loads
; CHECK: mov {{.*}}, g32_d
; CHECK: mov {{.*}}, byte ptr
; CHECK: mov {{.*}}, byte ptr
; CHECK: mov {{.*}}, byte ptr
; CHECK: mfence
; CHECK: mov {{.*}}, byte ptr
; Test where an identical load from two branches could have been hoisted
; up, and then the code merged, but a fence prevents it.
define i32 @could_have_hoisted_loads(i32 %x) {
entry:
%ptr = bitcast [4 x i8]* @g32_d to i32*
%cmp = icmp eq i32 %x, 1
br i1 %cmp, label %branch1, label %branch2
branch1:
%y = load i32* %ptr
ret i32 %y
branch2:
call void @llvm.nacl.atomic.fence.all()
%z = load i32* %ptr
ret i32 %z
}
; CHECK-LABEL: could_have_hoisted_loads
; CHECK: mov {{.*}}, g32_d
; CHECK: je {{.*}}
; CHECK: jmp {{.*}}
; CHECK: mov {{.*}}, dword ptr
; CHECK: ret
; CHECK: mfence
; CHECK: mov {{.*}}, dword ptr
; CHECK: ret