emitIAS for icmp, and test, movss-reg, movq, movups, storep, storeq, tighten some of the Xmm ops

The "test" instruction is used in very limited situations. I've made a best effort to fill in the possible forms (including an address as the first operand), but those forms are not tested, so I put the *untested* parts behind an assert. Otherwise it's very similar to icmp, so if it starts to be used and tested then the asserts can be taken out and the code shared with icmp.

Tighten some of the XMM dispatch/emitters. Most of those XMM instructions can only encode the variant where dest is a register. Rather than waste a slot on a NULL method pointer, make the emitter struct hold two method pointers instead of three. Fill out a couple of XMM instructions which *do* allow mem-ops as dest (mov instructions).

BUG=none
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/624263002

emitIAS for icmp, and test, movss-reg, movq, movups, storep, storeq, tighten some of the Xmm ops
e4dc61bf · Jan Voung · df861f73 · e4dc61bf · e4dc61bf · e4dc61bf
Commit e4dc61bf authored Oct 06, 2014 by Jan Voung
9 changed files
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -200,7 +200,7 @@ public:
    Movd,
    Movp,
    Movq,
-    Movss,
+    MovssRegs,
    Movsx,
    Movzx,
    Mul,
@@ -521,9 +521,9 @@ private:
  static const x86::AssemblerX86::GPREmitterRegOp Emitter;
 };
-void emitIASVarOperandTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
+void emitIASRegOpTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
-                            const Operand *Src,
+                       const Operand *Src,
-                            const x86::AssemblerX86::XmmEmitterTwoOps &Emitter);
+                       const x86::AssemblerX86::XmmEmitterRegOp &Emitter);
 template <InstX8632::InstKindX8632 K>
 class InstX8632UnaryopXmm : public InstX8632 {
@@ -544,7 +544,7 @@ public:
  void emitIAS(const Cfg *Func) const override {
    Type Ty = getDest()->getType();
    assert(getSrcSize() == 1);
-    emitIASVarOperandTyXMM(Func, Ty, getDest(), getSrc(0), Emitter);
+    emitIASRegOpTyXMM(Func, Ty, getDest(), getSrc(0), Emitter);
  }
  void dump(const Cfg *Func) const override {
    Ostream &Str = Func->getContext()->getStrDump();
@@ -563,7 +563,7 @@ private:
  InstX8632UnaryopXmm &operator=(const InstX8632UnaryopXmm &) = delete;
  ~InstX8632UnaryopXmm() override {}
  static const char *Opcode;
-  static const x86::AssemblerX86::XmmEmitterTwoOps Emitter;
+  static const x86::AssemblerX86::XmmEmitterRegOp Emitter;
 };
 // See the definition of emitTwoAddress() for a description of
@@ -571,37 +571,6 @@ private:
 void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
                    bool ShiftHack = false);
-template <InstX8632::InstKindX8632 K> class InstX8632Binop : public InstX8632 {
-public:
-  // Create a binary-op instruction (not yet migrated to integrated assembler)
-  static InstX8632Binop *create(Cfg *Func, Variable *Dest, Operand *Source) {
-    return new (Func->allocate<InstX8632Binop>())
-        InstX8632Binop(Func, Dest, Source);
-  }
-  void emit(const Cfg *Func) const override {
-    const bool ShiftHack = false;
-    emitTwoAddress(Opcode, this, Func, ShiftHack);
-  }
-  void dump(const Cfg *Func) const override {
-    Ostream &Str = Func->getContext()->getStrDump();
-    dumpDest(Func);
-    Str << " = " << Opcode << "." << getDest()->getType() << " ";
-    dumpSources(Func);
-  }
-  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
-private:
-  InstX8632Binop(Cfg *Func, Variable *Dest, Operand *Source)
-      : InstX8632(Func, K, 2, Dest) {
-    addSource(Dest);
-    addSource(Source);
-  }
-  InstX8632Binop(const InstX8632Binop &) = delete;
-  InstX8632Binop &operator=(const InstX8632Binop &) = delete;
-  ~InstX8632Binop() override {}
-  static const char *Opcode;
-};
 void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
                     const Operand *Src,
                     const x86::AssemblerX86::GPREmitterShiftOp &Emitter);
@@ -700,7 +669,7 @@ public:
    if (NeedsElementType)
      Ty = typeElementType(Ty);
    assert(getSrcSize() == 2);
-    emitIASVarOperandTyXMM(Func, Ty, getDest(), getSrc(1), Emitter);
+    emitIASRegOpTyXMM(Func, Ty, getDest(), getSrc(1), Emitter);
  }
  void dump(const Cfg *Func) const override {
    Ostream &Str = Func->getContext()->getStrDump();
@@ -720,7 +689,7 @@ private:
  InstX8632BinopXmm &operator=(const InstX8632BinopXmm &) = delete;
  ~InstX8632BinopXmm() override {}
  static const char *Opcode;
-  static const x86::AssemblerX86::XmmEmitterTwoOps Emitter;
+  static const x86::AssemblerX86::XmmEmitterRegOp Emitter;
 };
 void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
@@ -866,6 +835,7 @@ public:
  }
  bool isSimpleAssign() const override { return true; }
  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override { emit(Func); }
  void dump(const Cfg *Func) const override {
    Ostream &Str = Func->getContext()->getStrDump();
    Str << Opcode << "." << getDest()->getType() << " ";
@@ -935,13 +905,14 @@ typedef InstX8632BinopGPRShift<InstX8632::Sar> InstX8632Sar;
 typedef InstX8632BinopXmmShift<InstX8632::Psra> InstX8632Psra;
 typedef InstX8632BinopXmm<InstX8632::Pcmpeq, true> InstX8632Pcmpeq;
 typedef InstX8632BinopXmm<InstX8632::Pcmpgt, true> InstX8632Pcmpgt;
-// TODO: movss is only a binary operation when the source and dest
+// movss is only a binary operation when the source and dest
-// operands are both registers.  In other cases, it behaves like a copy
+// operands are both registers (the high bits of dest are left untouched).
-// (mov-like) operation.  Eventually, InstX8632Movss should assert that
+// In other cases, it behaves like a copy (mov-like) operation (and the
-// both its source and dest operands are registers, and the lowering
+// high bits of dest are cleared).
-// code should use _mov instead of _movss in cases where a copy
+// InstX8632MovssRegs will assert that both its source and dest operands are
-// operation is intended.
+// registers, so the lowering code should use _mov instead of _movss
-typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss;
+// in cases where a copy operation is intended.
+typedef InstX8632BinopXmm<InstX8632::MovssRegs, false> InstX8632MovssRegs;
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
 typedef InstX8632Ternop<InstX8632::Insertps> InstX8632Insertps;
@@ -1163,6 +1134,7 @@ public:
        InstX8632Icmp(Func, Src1, Src2);
  }
  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
  void dump(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return isClassof(Inst, Icmp); }
@@ -1199,6 +1171,7 @@ public:
    return new (Func->allocate<InstX8632UD2>()) InstX8632UD2(Func);
  }
  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
  void dump(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return isClassof(Inst, UD2); }
@@ -1217,6 +1190,7 @@ public:
        InstX8632Test(Func, Source1, Source2);
  }
  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
  void dump(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return isClassof(Inst, Test); }
@@ -1265,38 +1239,43 @@ private:
  ~InstX8632Store() override {}
 };
+// This is essentially a vector "mov" instruction with an OperandX8632Mem
+// operand instead of Variable as the destination.  It's important
+// for liveness that there is no Dest operand. The source must be an
+// Xmm register, since Dest is mem.
 class InstX8632StoreP : public InstX8632 {
 public:
-  static InstX8632StoreP *create(Cfg *Func, Operand *Value, OperandX8632 *Mem) {
+  static InstX8632StoreP *create(Cfg *Func, Variable *Value,
+                                 OperandX8632Mem *Mem) {
    return new (Func->allocate<InstX8632StoreP>())
        InstX8632StoreP(Func, Value, Mem);
  }
  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
  void dump(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return isClassof(Inst, StoreP); }
 private:
-  InstX8632StoreP(Cfg *Func, Operand *Value, OperandX8632 *Mem);
+  InstX8632StoreP(Cfg *Func, Variable *Value, OperandX8632Mem *Mem);
  InstX8632StoreP(const InstX8632StoreP &) = delete;
  InstX8632StoreP &operator=(const InstX8632StoreP &) = delete;
  ~InstX8632StoreP() override {}
 };
-// This is essentially a "movq" instruction with an OperandX8632Mem
-// operand instead of Variable as the destination.  It's important
-// for liveness that there is no Dest operand.
 class InstX8632StoreQ : public InstX8632 {
 public:
-  static InstX8632StoreQ *create(Cfg *Func, Operand *Value, OperandX8632 *Mem) {
+  static InstX8632StoreQ *create(Cfg *Func, Variable *Value,
+                                 OperandX8632Mem *Mem) {
    return new (Func->allocate<InstX8632StoreQ>())
        InstX8632StoreQ(Func, Value, Mem);
  }
  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
  void dump(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return isClassof(Inst, StoreQ); }
 private:
-  InstX8632StoreQ(Cfg *Func, Operand *Value, OperandX8632 *Mem);
+  InstX8632StoreQ(Cfg *Func, Variable *Value, OperandX8632Mem *Mem);
  InstX8632StoreQ(const InstX8632StoreQ &) = delete;
  InstX8632StoreQ &operator=(const InstX8632StoreQ &) = delete;
  ~InstX8632StoreQ() override {}
@@ -1535,6 +1514,9 @@ template <> void InstX8632Idiv::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Imul::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Cbwdq::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Movd::emitIAS(const Cfg *Func) const;
+template <> void InstX8632Movp::emitIAS(const Cfg *Func) const;
+template <> void InstX8632Movq::emitIAS(const Cfg *Func) const;
+template <> void InstX8632MovssRegs::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Pmull::emitIAS(const Cfg *Func) const;
 } // end of namespace Ice

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -308,8 +308,8 @@ protected:
  void _movq(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Movq::create(Func, Dest, Src0));
  }
-  void _movss(Variable *Dest, Operand *Src0) {
+  void _movss(Variable *Dest, Variable *Src0) {
-    Context.insert(InstX8632Movss::create(Func, Dest, Src0));
+    Context.insert(InstX8632MovssRegs::create(Func, Dest, Src0));
  }
  void _movsx(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Movsx::create(Func, Dest, Src0));
@@ -422,10 +422,10 @@ protected:
  void _store(Operand *Value, OperandX8632 *Mem) {
    Context.insert(InstX8632Store::create(Func, Value, Mem));
  }
-  void _storep(Operand *Value, OperandX8632 *Mem) {
+  void _storep(Variable *Value, OperandX8632Mem *Mem) {
    Context.insert(InstX8632StoreP::create(Func, Value, Mem));
  }
-  void _storeq(Operand *Value, OperandX8632 *Mem) {
+  void _storeq(Variable *Value, OperandX8632Mem *Mem) {
    Context.insert(InstX8632StoreQ::create(Func, Value, Mem));
  }
  void _sub(Variable *Dest, Operand *Src0) {

--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -340,12 +340,20 @@ void AssemblerX86::movd(const Address &dst, XmmRegister src) {
  EmitOperand(src, dst);
 }
+void AssemblerX86::movq(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitUint8(0x0F);
+  EmitUint8(0x7E);
+  EmitRegisterOperand(dst, src);
+}
 void AssemblerX86::movq(const Address &dst, XmmRegister src) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  EmitUint8(0x66);
  EmitUint8(0x0F);
  EmitUint8(0xD6);
-  EmitOperand(src, Operand(dst));
+  EmitOperand(src, dst);
 }
 void AssemblerX86::movq(XmmRegister dst, const Address &src) {
@@ -353,7 +361,7 @@ void AssemblerX86::movq(XmmRegister dst, const Address &src) {
  EmitUint8(0xF3);
  EmitUint8(0x0F);
  EmitUint8(0x7E);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst, src);
 }
 void AssemblerX86::addss(Type Ty, XmmRegister dst, XmmRegister src) {
@@ -463,6 +471,13 @@ void AssemblerX86::movaps(XmmRegister dst, XmmRegister src) {
  EmitXmmRegisterOperand(dst, src);
 }
+void AssemblerX86::movups(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x10);
+  EmitRegisterOperand(dst, src);
+}
 void AssemblerX86::movups(XmmRegister dst, const Address &src) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  EmitUint8(0x0F);
@@ -1289,52 +1304,90 @@ void AssemblerX86::fincstp() {
  EmitUint8(0xF7);
 }
-void AssemblerX86::cmpl(GPRRegister reg, const Immediate &imm) {
+void AssemblerX86::cmp(Type Ty, GPRRegister reg, const Immediate &imm) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(BrokenType, 7, Operand(reg), imm);
+  if (isByteSizedType(Ty)) {
+    EmitComplexI8(7, Operand(reg), imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitComplex(Ty, 7, Operand(reg), imm);
 }
-void AssemblerX86::cmpl(GPRRegister reg0, GPRRegister reg1) {
+void AssemblerX86::cmp(Type Ty, GPRRegister reg0, GPRRegister reg1) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x3B);
+  if (Ty == IceType_i16)
-  EmitOperand(reg0, Operand(reg1));
+    EmitOperandSizeOverride();
+  if (isByteSizedType(Ty))
+    EmitUint8(0x3A);
+  else
+    EmitUint8(0x3B);
+  EmitRegisterOperand(reg0, reg1);
 }
-void AssemblerX86::cmpl(GPRRegister reg, const Address &address) {
+void AssemblerX86::cmp(Type Ty, GPRRegister reg, const Address &address) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x3B);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (isByteSizedType(Ty))
+    EmitUint8(0x3A);
+  else
+    EmitUint8(0x3B);
  EmitOperand(reg, address);
 }
-void AssemblerX86::cmpl(const Address &address, GPRRegister reg) {
+void AssemblerX86::cmp(Type Ty, const Address &address, GPRRegister reg) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x39);
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  if (isByteSizedType(Ty))
+    EmitUint8(0x38);
+  else
+    EmitUint8(0x39);
  EmitOperand(reg, address);
 }
-void AssemblerX86::cmpl(const Address &address, const Immediate &imm) {
+void AssemblerX86::cmp(Type Ty, const Address &address, const Immediate &imm) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(BrokenType, 7, address, imm);
+  if (isByteSizedType(Ty)) {
+    EmitComplexI8(7, address, imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitComplex(Ty, 7, address, imm);
 }
-void AssemblerX86::cmpb(const Address &address, const Immediate &imm) {
+void AssemblerX86::test(Type Ty, GPRRegister reg1, GPRRegister reg2) {
-  assert(imm.is_int8());
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x80);
+  if (Ty == IceType_i16)
-  EmitOperand(7, address);
+    EmitOperandSizeOverride();
-  EmitUint8(imm.value() & 0xFF);
+  if (isByteSizedType(Ty))
+    EmitUint8(0x84);
+  else
+    EmitUint8(0x85);
+  EmitRegisterOperand(reg1, reg2);
 }
-void AssemblerX86::testl(GPRRegister reg1, GPRRegister reg2) {
+void AssemblerX86::test(Type Ty, const Address &addr, GPRRegister reg) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x85);
+  if (Ty == IceType_i16)
-  EmitRegisterOperand(reg1, reg2);
+    EmitOperandSizeOverride();
+  if (isByteSizedType(Ty))
+    EmitUint8(0x84);
+  else
+    EmitUint8(0x85);
+  EmitOperand(reg, addr);
 }
-void AssemblerX86::testl(GPRRegister reg, const Immediate &immediate) {
+void AssemblerX86::test(Type Ty, GPRRegister reg, const Immediate &immediate) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  // For registers that have a byte variant (EAX, EBX, ECX, and EDX)
  // we only test the byte register to keep the encoding short.
+  // This is legal even if the register had high bits set since
+  // this only sets flags registers based on the "AND" of the two operands,
+  // and the immediate had zeros at those high bits.
  if (immediate.is_uint8() && reg < 4) {
    // Use zero-extended 8-bit immediate.
    if (reg == RegX8632::Encoded_Reg_eax) {
@@ -1346,12 +1399,35 @@ void AssemblerX86::testl(GPRRegister reg, const Immediate &immediate) {
    EmitUint8(immediate.value() & 0xFF);
  } else if (reg == RegX8632::Encoded_Reg_eax) {
    // Use short form if the destination is EAX.
+    if (Ty == IceType_i16)
+      EmitOperandSizeOverride();
    EmitUint8(0xA9);
-    EmitImmediate(BrokenType, immediate);
+    EmitImmediate(Ty, immediate);
+  } else {
+    if (Ty == IceType_i16)
+      EmitOperandSizeOverride();
+    EmitUint8(0xF7);
+    EmitRegisterOperand(0, reg);
+    EmitImmediate(Ty, immediate);
+  }
+}
+void AssemblerX86::test(Type Ty, const Address &addr,
+                        const Immediate &immediate) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  // If the immediate is short, we only test the byte addr to keep the
+  // encoding short.
+  if (immediate.is_uint8()) {
+    // Use zero-extended 8-bit immediate.
+    EmitUint8(0xF6);
+    EmitOperand(0, addr);
+    EmitUint8(immediate.value() & 0xFF);
  } else {
+    if (Ty == IceType_i16)
+      EmitOperandSizeOverride();
    EmitUint8(0xF7);
-    EmitOperand(0, Operand(reg));
+    EmitOperand(0, addr);
-    EmitImmediate(BrokenType, immediate);
+    EmitImmediate(Ty, immediate);
  }
 }
@@ -2013,6 +2089,12 @@ void AssemblerX86::hlt() {
  EmitUint8(0xF4);
 }
+void AssemblerX86::ud2() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x0B);
+}
 void AssemblerX86::j(CondX86::BrCond condition, Label *label, bool near) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  if (label->IsBound()) {

--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -361,16 +361,31 @@ public:
    // In practice, we always normalize the Dest to a Register first.
  };
+  typedef void (AssemblerX86::*TypedEmitAddrGPR)(Type, const Address &,
+                                                 GPRRegister);
+  typedef void (AssemblerX86::*TypedEmitAddrImm)(Type, const Address &,
+                                                 const Immediate &);
+  struct GPREmitterAddrOp {
+    TypedEmitAddrGPR AddrGPR;
+    TypedEmitAddrImm AddrImm;
+  };
  // Operations to emit XMM instructions (and dispatch on operand type).
  typedef void (AssemblerX86::*TypedEmitXmmXmm)(Type, XmmRegister, XmmRegister);
  typedef void (AssemblerX86::*TypedEmitXmmAddr)(Type, XmmRegister,
                                                 const Address &);
-  typedef void (AssemblerX86::*TypedEmitAddrXmm)(Type, const Address &,
+  struct XmmEmitterRegOp {
-                                                 XmmRegister);
-  struct XmmEmitterTwoOps {
    TypedEmitXmmXmm XmmXmm;
    TypedEmitXmmAddr XmmAddr;
-    TypedEmitAddrXmm AddrXmm;
+  };
+  typedef void (AssemblerX86::*EmitXmmXmm)(XmmRegister, XmmRegister);
+  typedef void (AssemblerX86::*EmitXmmAddr)(XmmRegister, const Address &);
+  typedef void (AssemblerX86::*EmitAddrXmm)(const Address &, XmmRegister);
+  struct XmmEmitterMovOps {
+    EmitXmmXmm XmmXmm;
+    EmitXmmAddr XmmAddr;
+    EmitAddrXmm AddrXmm;
  };
  typedef void (AssemblerX86::*TypedEmitXmmImm)(Type, XmmRegister,
@@ -442,6 +457,7 @@ public:
  void movd(GPRRegister dst, XmmRegister src);
  void movd(const Address &dst, XmmRegister src);
+  void movq(XmmRegister dst, XmmRegister src);
  void movq(const Address &dst, XmmRegister src);
  void movq(XmmRegister dst, const Address &src);
@@ -460,6 +476,7 @@ public:
  void movaps(XmmRegister dst, XmmRegister src);
+  void movups(XmmRegister dst, XmmRegister src);
  void movups(XmmRegister dst, const Address &src);
  void movups(const Address &dst, XmmRegister src);
@@ -591,15 +608,16 @@ public:
  void fincstp();
-  void cmpl(GPRRegister reg, const Immediate &imm);
+  void cmp(Type Ty, GPRRegister reg0, GPRRegister reg1);
-  void cmpl(GPRRegister reg0, GPRRegister reg1);
+  void cmp(Type Ty, GPRRegister reg, const Address &address);
-  void cmpl(GPRRegister reg, const Address &address);
+  void cmp(Type Ty, GPRRegister reg, const Immediate &imm);
-  void cmpl(const Address &address, GPRRegister reg);
+  void cmp(Type Ty, const Address &address, GPRRegister reg);
-  void cmpl(const Address &address, const Immediate &imm);
+  void cmp(Type Ty, const Address &address, const Immediate &imm);
-  void cmpb(const Address &address, const Immediate &imm);
-  void testl(GPRRegister reg1, GPRRegister reg2);
+  void test(Type Ty, GPRRegister reg0, GPRRegister reg1);
-  void testl(GPRRegister reg, const Immediate &imm);
+  void test(Type Ty, GPRRegister reg, const Immediate &imm);
+  void test(Type Ty, const Address &address, GPRRegister reg);
+  void test(Type Ty, const Address &address, const Immediate &imm);
  void And(Type Ty, GPRRegister dst, GPRRegister src);
  void And(Type Ty, GPRRegister dst, const Address &address);
@@ -698,6 +716,7 @@ public:
  void nop(int size = 1);
  void int3();
  void hlt();
+  void ud2();
  void j(CondX86::BrCond condition, Label *label, bool near = kFarJump);
  void j(CondX86::BrCond condition, const ConstantRelocatable *label);

--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -276,5 +276,40 @@ entry:
 ; CHECK-LABEL: testShl16Imm1
 ; CHECK: 66 d1 e0 shl ax
+; Currently the "test" instruction is used for 64-bit shifts, and
+; for ctlz 64-bit, so we use those to test the "test" instruction.
+; One optimization for "test": the "test" instruction is essentially a
+; bitwise AND that doesn't modify the two source operands, so for immediates
+; under 8-bits and registers with 8-bit variants we can use the shorter form.
+define internal i64 @test_via_shl64Bit(i64 %a, i64 %b) {
+entry:
+  %shl = shl i64 %a, %b
+  ret i64 %shl
+}
+; CHECK-LABEL: test_via_shl64Bit
+; CHECK: 0f a5 c2  shld edx, eax, cl
+; CHECK: d3 e0     shl eax, cl
+; CHECK: f6 c1 20  test cl, 32
+; Test a few register encodings of "test".
+declare i64 @llvm.ctlz.i64(i64, i1)
+define i64 @test_via_ctlz_64(i64 %x, i64 %y, i64 %z, i64 %w) {
+entry:
+  %r = call i64 @llvm.ctlz.i64(i64 %x, i1 false)
+  %r2 = call i64 @llvm.ctlz.i64(i64 %y, i1 false)
+  %r3 = call i64 @llvm.ctlz.i64(i64 %z, i1 false)
+  %r4 = call i64 @llvm.ctlz.i64(i64 %w, i1 false)
+  %res1 = add i64 %r, %r2
+  %res2 = add i64 %r3, %r4
+  %res = add i64 %res1, %res2
+  ret i64 %res
+}
+; CHECK-LABEL: test_via_ctlz_64
+; CHECK-DAG: 85 c0 test eax, eax
+; CHECK-DAG: 85 db test ebx, ebx
+; CHECK-DAG: 85 f6 test esi, esi
 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ
--- a/tests_lit/assembler/x86/opcode_register_encodings.ll
+++ b/tests_lit/assembler/x86/opcode_register_encodings.ll
@@ -6,7 +6,6 @@
 ; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
 ; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
 ; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
-; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
 define <8 x i16> @test_mul_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
 entry:
@@ -83,5 +82,49 @@ entry:
 ; CHECK-DAG: 66 0f 38 40 8c 24 80 00 00 00 pmulld xmm1, xmmword ptr [esp + 128]
 }
+; Test movq, which is used by atomic stores.
+declare void @llvm.nacl.atomic.store.i64(i64, i64*, i32)
+define void @test_atomic_store_64(i32 %iptr, i32 %iptr2, i32 %iptr3, i64 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %ptr2 = inttoptr i32 %iptr2 to i64*
+  %ptr3 = inttoptr i32 %iptr3 to i64*
+  call void @llvm.nacl.atomic.store.i64(i64 %v, i64* %ptr2, i32 6)
+  call void @llvm.nacl.atomic.store.i64(i64 1234567891024, i64* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i64(i64 %v, i64* %ptr3, i32 6)
+  ret void
+}
+; CHECK-LABEL: test_atomic_store_64
+; CHECK-DAG: f3 0f 7e 04 24    movq xmm0, qword ptr [esp]
+; CHECK-DAG: f3 0f 7e 44 24 08 movq xmm0, qword ptr [esp + 8]
+; CHECK-DAG: 66 0f d6 0{{.*}}  movq qword ptr [e{{.*}}], xmm0
+; Test "movups" via vector stores and loads.
+define void @store_v16xI8(i32 %addr, i32 %addr2, i32 %addr3, <16 x i8> %v) {
+  %addr_v16xI8 = inttoptr i32 %addr to <16 x i8>*
+  %addr2_v16xI8 = inttoptr i32 %addr2 to <16 x i8>*
+  %addr3_v16xI8 = inttoptr i32 %addr3 to <16 x i8>*
+  store <16 x i8> %v, <16 x i8>* %addr2_v16xI8, align 1
+  store <16 x i8> %v, <16 x i8>* %addr_v16xI8, align 1
+  store <16 x i8> %v, <16 x i8>* %addr3_v16xI8, align 1
+  ret void
+}
+; CHECK-LABEL: store_v16xI8
+; CHECK: 0f 11 0{{.*}} movups xmmword ptr [e{{.*}}], xmm0
+define <16 x i8> @load_v16xI8(i32 %addr, i32 %addr2, i32 %addr3) {
+  %addr_v16xI8 = inttoptr i32 %addr to <16 x i8>*
+  %addr2_v16xI8 = inttoptr i32 %addr2 to <16 x i8>*
+  %addr3_v16xI8 = inttoptr i32 %addr3 to <16 x i8>*
+  %res1 = load <16 x i8>* %addr2_v16xI8, align 1
+  %res2 = load <16 x i8>* %addr_v16xI8, align 1
+  %res3 = load <16 x i8>* %addr3_v16xI8, align 1
+  %res12 = add <16 x i8> %res1, %res2
+  %res123 = add <16 x i8> %res12, %res3
+  ret <16 x i8> %res123
+}
+; CHECK-LABEL: load_v16xI8
+; CHECK: 0f 10 0{{.*}} movups xmm0, xmmword ptr [e{{.*}}]
 ; ERRORS-NOT: ICE translation error
-; DUMP-NOT: SZ
--- a/tests_lit/llvm2ice_tests/8bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -7,7 +7,6 @@
 ; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
 ; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
 ; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
-; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
 define internal i32 @add8Bit(i32 %a, i32 %b) {
 entry:
@@ -20,7 +19,7 @@ entry:
 ; CHECK-LABEL: add8Bit
 ; CHECK: add {{[abcd]l}}
-define internal i32 @add8BitConst(i32 %a, i32 %b) {
+define internal i32 @add8BitConst(i32 %a) {
 entry:
  %a_8 = trunc i32 %a to i8
  %add = add i8 %a_8, 123
@@ -41,7 +40,7 @@ entry:
 ; CHECK-LABEL: sub8Bit
 ; XCHECK: sub {{[abcd]l}}
-define internal i32 @sub8BitConst(i32 %a, i32 %b) {
+define internal i32 @sub8BitConst(i32 %a) {
 entry:
  %a_8 = trunc i32 %a to i8
  %sub = sub i8 %a_8, 123
@@ -62,7 +61,7 @@ entry:
 ; CHECK-LABEL: mul8Bit
 ; CHECK: mul {{[abcd]l|byte ptr}}
-define internal i32 @mul8BitConst(i32 %a, i32 %b) {
+define internal i32 @mul8BitConst(i32 %a) {
 entry:
  %a_8 = trunc i32 %a to i8
  %mul = mul i8 %a_8, 56
@@ -85,7 +84,7 @@ entry:
 ; CHECK-LABEL: udiv8Bit
 ; CHECK: div {{[abcd]l|byte ptr}}
-define internal i32 @udiv8BitConst(i32 %a, i32 %b) {
+define internal i32 @udiv8BitConst(i32 %a) {
 entry:
  %a_8 = trunc i32 %a to i8
  %udiv = udiv i8 %a_8, 123
@@ -106,7 +105,7 @@ entry:
 ; CHECK-LABEL: urem8Bit
 ; CHECK: div {{[abcd]l|byte ptr}}
-define internal i32 @urem8BitConst(i32 %a, i32 %b) {
+define internal i32 @urem8BitConst(i32 %a) {
 entry:
  %a_8 = trunc i32 %a to i8
  %urem = urem i8 %a_8, 123
@@ -128,7 +127,7 @@ entry:
 ; CHECK-LABEL: sdiv8Bit
 ; CHECK: idiv {{[abcd]l|byte ptr}}
-define internal i32 @sdiv8BitConst(i32 %a, i32 %b) {
+define internal i32 @sdiv8BitConst(i32 %a) {
 entry:
  %a_8 = trunc i32 %a to i8
  %sdiv = sdiv i8 %a_8, 123
@@ -149,7 +148,7 @@ entry:
 ; CHECK-LABEL: srem8Bit
 ; CHECK: idiv {{[abcd]l|byte ptr}}
-define internal i32 @srem8BitConst(i32 %a, i32 %b) {
+define internal i32 @srem8BitConst(i32 %a) {
 entry:
  %a_8 = trunc i32 %a to i8
  %srem = srem i8 %a_8, 123
@@ -222,6 +221,60 @@ entry:
 ; CHECK-LABEL: ashr8BitConst
 ; CHECK: sar {{[abcd]l|byte ptr}}, 6
+define internal i32 @icmp8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %icmp = icmp ne i8 %b_8, %a_8
+  %ret = zext i1 %icmp to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: icmp8Bit
+; CHECK: cmp {{[abcd]l|byte ptr}}
+define internal i32 @icmp8BitConst(i32 %a) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %icmp = icmp ne i8 %a_8, 123
+  %ret = zext i1 %icmp to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: icmp8BitConst
+; CHECK: cmp {{[abcd]l|byte ptr}}
+define internal i32 @icmp8BitConstSwapped(i32 %a) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %icmp = icmp ne i8 123, %a_8
+  %ret = zext i1 %icmp to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: icmp8BitConstSwapped
+; CHECK: cmp {{[abcd]l|byte ptr}}
+define internal i32 @icmp8BitMem(i32 %a, i32 %b_iptr) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %bptr = inttoptr i32 %b_iptr to i8*
+  %b_8 = load i8* %bptr, align 1
+  %icmp = icmp ne i8 %b_8, %a_8
+  %ret = zext i1 %icmp to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: icmp8BitMem
+; CHECK: cmp {{[abcd]l|byte ptr}}
+define internal i32 @icmp8BitMemSwapped(i32 %a, i32 %b_iptr) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %bptr = inttoptr i32 %b_iptr to i8*
+  %b_8 = load i8* %bptr, align 1
+  %icmp = icmp ne i8 %a_8, %b_8
+  %ret = zext i1 %icmp to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: icmp8BitMemSwapped
+; CHECK: cmp {{[abcd]l|byte ptr}}
 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ
--- a/tests_lit/llvm2ice_tests/vector-arg.ll
+++ b/tests_lit/llvm2ice_tests/vector-arg.ll
@@ -171,7 +171,6 @@ entry:
 ; CHECK: movups  xmm3, xmmword ptr [esp + 80]
 ; CHECK: call -4
 ; CHECK-NEXT: add esp, 32
-; CHECK: ret
 ; OPTM1-LABEL: test_passing_vectors:
 ; OPTM1: sub esp, 32
@@ -185,7 +184,6 @@ entry:
 ; OPTM1: movups  xmm3, xmmword ptr {{.*}}
 ; OPTM1: call -4
 ; OPTM1-NEXT: add esp, 32
-; OPTM1: ret
 }
 declare void @InterspersedVectorArgs(<4 x float>, i64, <4 x float>, i64, <4 x float>, float, <4 x float>, double, <4 x float>, i32, <4 x float>)