Handle GPR and vector shift ops. Handle pmull also.

For the integer shift ops, since the Src1 operand is forced to be an immediate or a register (cl), it should be legal for Dest+Src0 to be either a register or memory. However, we currently use only the register form. Shifts with Dest+Src0 in memory might be less optimized on some micro-architectures, since they have to load, shift, and store all in one operation, but I'm not sure. BUG=none R=stichnot@chromium.org Review URL: https://codereview.chromium.org/622113002

Handle GPR and vector shift ops. Handle pmull also.
8bcca041 · Jan Voung · 541ba667 · 8bcca041 · 8bcca041 · 8bcca041
Commit 8bcca041 authored Oct 03, 2014 by Jan Voung
10 changed files
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -536,6 +536,65 @@ void emitIASRegOpTyGPR(const Cfg *Func, Type Ty, const Variable *Var,
  emitIASBytes(Str, Asm, StartPosition);
 }

+// Emits one GPR shift/rotate instruction through the integrated assembler
+// and then hands the newly produced bytes to emitIASBytes.
+//   Ty      - operand type, forwarded to the assembler routine.
+//   Var     - destination (and first source); must be register-allocated.
+//   Src     - shift count: a register-allocated Variable or a
+//             ConstantInteger32.
+//   Emitter - table of assembler member functions for this opcode.
+void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src,
+                     const x86::AssemblerX86::GPREmitterShiftOp &Emitter) {
+  x86::AssemblerX86 *Assembler = Func->getAssembler<x86::AssemblerX86>();
+  // Remember where this instruction starts so its bytes can be dumped below.
+  const intptr_t EmitStart = Assembler->GetPosition();
+  // A memory destination would also be encodable, but only the register
+  // form is used today; revisit this check if that ever changes.
+  assert(Var->hasReg());
+  // GPRRegister is (ab)used to carry byte-register encodings as well.
+  const RegX8632::GPRRegister DestReg =
+      RegX8632::getEncodedByteRegOrGPR(Ty, Var->getRegNum());
+  // The count has to be ECX or an 8-bit immediate; the assembler itself
+  // asserts that, so it is not re-checked here.
+  if (const auto *CountVar = llvm::dyn_cast<Variable>(Src)) {
+    assert(CountVar->hasReg());
+    const RegX8632::GPRRegister CountReg =
+        RegX8632::getEncodedByteRegOrGPR(Ty, CountVar->getRegNum());
+    (Assembler->*(Emitter.GPRGPR))(Ty, DestReg, CountReg);
+  } else if (const auto *CountImm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    const x86::Immediate Count(CountImm->getValue());
+    (Assembler->*(Emitter.GPRImm))(Ty, DestReg, Count);
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+  Ostream &Out = Func->getContext()->getStrEmit();
+  emitIASBytes(Out, Assembler, EmitStart);
+}
+
+// Emits one XMM shift instruction through the integrated assembler and then
+// hands the newly produced bytes to emitIASBytes.
+//   Ty      - type forwarded to the assembler routine.
+//   Var     - destination (and first source); must be register-allocated.
+//   Src     - shift count: a Variable (in a register or not), an
+//             OperandX8632Mem, or a ConstantInteger32.
+//   Emitter - table of assembler member functions for this opcode.
+void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src,
+                     const x86::AssemblerX86::XmmEmitterShiftOp &Emitter) {
+  x86::AssemblerX86 *Assembler = Func->getAssembler<x86::AssemblerX86>();
+  // Remember where this instruction starts so its bytes can be dumped below.
+  const intptr_t EmitStart = Assembler->GetPosition();
+  assert(Var->hasReg());
+  const RegX8632::XmmRegister DestReg =
+      RegX8632::getEncodedXmm(Var->getRegNum());
+  if (const auto *CountVar = llvm::dyn_cast<Variable>(Src)) {
+    if (!CountVar->hasReg()) {
+      // No register assigned: ask the target for the variable's address
+      // and use the memory-operand form.
+      TargetX8632 *Target = static_cast<TargetX8632 *>(Func->getTarget());
+      x86::Address CountSlot = Target->stackVarToAsmOperand(CountVar);
+      (Assembler->*(Emitter.XmmAddr))(Ty, DestReg, CountSlot);
+    } else {
+      const RegX8632::XmmRegister CountReg =
+          RegX8632::getEncodedXmm(CountVar->getRegNum());
+      (Assembler->*(Emitter.XmmXmm))(Ty, DestReg, CountReg);
+    }
+  } else if (const auto *CountMem = llvm::dyn_cast<OperandX8632Mem>(Src)) {
+    x86::Address CountAddr = CountMem->toAsmAddress(Assembler);
+    (Assembler->*(Emitter.XmmAddr))(Ty, DestReg, CountAddr);
+  } else if (const auto *CountImm = llvm::dyn_cast<ConstantInteger32>(Src)) {
+    const x86::Immediate Count(CountImm->getValue());
+    (Assembler->*(Emitter.XmmImm))(Ty, DestReg, Count);
+  } else {
+    llvm_unreachable("Unexpected operand type");
+  }
+  Ostream &Out = Func->getContext()->getStrEmit();
+  emitIASBytes(Out, Assembler, EmitStart);
+}
+
 void
 emitIASVarOperandTyXMM(const Cfg *Func, Type Ty, const Variable *Var,
                       const Operand *Src,
@@ -691,6 +750,20 @@ template <>
 const x86::AssemblerX86::GPREmitterRegOp InstX8632Xor::Emitter = {
    &x86::AssemblerX86::Xor, &x86::AssemblerX86::Xor, &x86::AssemblerX86::Xor};

+// Binary Shift GPR ops: {GPRGPR (count in cl), GPRImm (imm8 count)} emitters.
+template <>
+const x86::AssemblerX86::GPREmitterShiftOp InstX8632Rol::Emitter = {
+    &x86::AssemblerX86::rol, &x86::AssemblerX86::rol};
+template <>
+const x86::AssemblerX86::GPREmitterShiftOp InstX8632Sar::Emitter = {
+    &x86::AssemblerX86::sar, &x86::AssemblerX86::sar};
+template <>
+const x86::AssemblerX86::GPREmitterShiftOp InstX8632Shl::Emitter = {
+    &x86::AssemblerX86::shl, &x86::AssemblerX86::shl};
+template <>
+const x86::AssemblerX86::GPREmitterShiftOp InstX8632Shr::Emitter = {
+    &x86::AssemblerX86::shr, &x86::AssemblerX86::shr};
+
 // Binary XMM ops
 template <>
 const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Addss::Emitter = {
@@ -726,6 +799,9 @@ template <>
 const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pcmpgt::Emitter = {
    &x86::AssemblerX86::pcmpgt, &x86::AssemblerX86::pcmpgt, NULL};
 template <>
+const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pmull::Emitter = {
+    &x86::AssemblerX86::pmull, &x86::AssemblerX86::pmull, NULL};
+template <>
 const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Pmuludq::Emitter = {
    &x86::AssemblerX86::pmuludq, &x86::AssemblerX86::pmuludq, NULL};
 template <>
@@ -744,6 +820,16 @@ template <>
 const x86::AssemblerX86::XmmEmitterTwoOps InstX8632Subps::Emitter = {
    &x86::AssemblerX86::subps, &x86::AssemblerX86::subps, NULL};

+// Binary XMM Shift ops: {XmmXmm, XmmAddr, XmmImm} emitters, in that order.
+template <>
+const x86::AssemblerX86::XmmEmitterShiftOp InstX8632Psll::Emitter = {
+    &x86::AssemblerX86::psll, &x86::AssemblerX86::psll,
+    &x86::AssemblerX86::psll};
+template <>
+const x86::AssemblerX86::XmmEmitterShiftOp InstX8632Psra::Emitter = {
+    &x86::AssemblerX86::psra, &x86::AssemblerX86::psra,
+    &x86::AssemblerX86::psra};
+
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 1);
@@ -787,6 +873,22 @@ template <> void InstX8632Pmull::emit(const Cfg *Func) const {
  emitTwoAddress(buf, this, Func);
 }

+// Integrated-assembler emission for pmull. The destination must be v8i16 or
+// v4i32, and the v4i32 form additionally requires at least SSE4.1. The
+// vector's element type (not the vector type) is what the assembler receives.
+template <> void InstX8632Pmull::emitIAS(const Cfg *Func) const {
+  const Type DestTy = getDest()->getType();
+  const bool IsWordVector = DestTy == IceType_v8i16;
+  const bool TypesAreValid = IsWordVector || DestTy == IceType_v4i32;
+  const bool InstructionSetIsValid =
+      IsWordVector ||
+      static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
+          TargetX8632::SSE4_1;
+  // The casts keep builds without assertions free of unused-variable
+  // warnings.
+  (void)TypesAreValid;
+  (void)InstructionSetIsValid;
+  assert(TypesAreValid);
+  assert(InstructionSetIsValid);
+  assert(getSrcSize() == 2);
+  emitIASVarOperandTyXMM(Func, typeElementType(DestTy), getDest(), getSrc(1),
+                         Emitter);
+}
+
 template <> void InstX8632Subss::emit(const Cfg *Func) const {
  char buf[30];
  snprintf(buf, llvm::array_lengthof(buf), "sub%s",

--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -571,15 +571,15 @@ private:
 void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
                    bool ShiftHack = false);

-template <InstX8632::InstKindX8632 K, bool ShiftHack = false>
-class InstX8632Binop : public InstX8632 {
+template <InstX8632::InstKindX8632 K> class InstX8632Binop : public InstX8632 {
 public:
-  // Create a binary-op instruction like shifts.
+  // Create a binary-op instruction (not yet migrated to integrated assembler)
  static InstX8632Binop *create(Cfg *Func, Variable *Dest, Operand *Source) {
    return new (Func->allocate<InstX8632Binop>())
        InstX8632Binop(Func, Dest, Source);
  }
  void emit(const Cfg *Func) const override {
+    const bool ShiftHack = false;
    emitTwoAddress(Opcode, this, Func, ShiftHack);
  }
  void dump(const Cfg *Func) const override {
@@ -602,6 +602,49 @@ private:
  static const char *Opcode;
 };

+// Shared integrated-assembler emission routine for all GPR shift/rotate
+// instructions; Emitter selects the opcode-specific assembler functions.
+void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src,
+                     const x86::AssemblerX86::GPREmitterShiftOp &Emitter);
+
+// Two-address GPR shift/rotate instruction of kind K. Dest is recorded as
+// source 0 and the shift count as source 1. Opcode and Emitter are supplied
+// per instantiation.
+template <InstX8632::InstKindX8632 K>
+class InstX8632BinopGPRShift : public InstX8632 {
+public:
+  // Builds the instruction in storage obtained from Func's allocator.
+  static InstX8632BinopGPRShift *create(Cfg *Func, Variable *Dest,
+                                        Operand *Source) {
+    return new (Func->allocate<InstX8632BinopGPRShift>())
+        InstX8632BinopGPRShift(Func, Dest, Source);
+  }
+  // Textual emission; GPR shifts pass ShiftHack=true to emitTwoAddress.
+  void emit(const Cfg *Func) const override {
+    emitTwoAddress(Opcode, this, Func, /*ShiftHack=*/true);
+  }
+  // Integrated-assembler emission, typed by the destination.
+  void emitIAS(const Cfg *Func) const override {
+    assert(getSrcSize() == 2);
+    const Variable *Dest = getDest();
+    emitIASGPRShift(Func, Dest->getType(), Dest, getSrc(1), Emitter);
+  }
+  // Dumps "<dest> = <opcode>.<type> <sources>" to the dump stream.
+  void dump(const Cfg *Func) const override {
+    Ostream &Out = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Out << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  // Registers Dest as source 0 and the shift count as source 1.
+  InstX8632BinopGPRShift(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX8632(Func, K, 2, Dest) {
+    addSource(Dest);
+    addSource(Source);
+  }
+  InstX8632BinopGPRShift(const InstX8632BinopGPRShift &) = delete;
+  InstX8632BinopGPRShift &operator=(const InstX8632BinopGPRShift &) = delete;
+  ~InstX8632BinopGPRShift() override {}
+  // Mnemonic used by emit() and dump().
+  static const char *Opcode;
+  // Assembler entry points used by emitIAS().
+  static const x86::AssemblerX86::GPREmitterShiftOp Emitter;
+};
+
 template <InstX8632::InstKindX8632 K>
 class InstX8632BinopGPR : public InstX8632 {
 public:
@@ -680,6 +723,52 @@ private:
  static const x86::AssemblerX86::XmmEmitterTwoOps Emitter;
 };

+// Shared integrated-assembler emission routine for all XMM shift
+// instructions; Emitter selects the opcode-specific assembler functions.
+void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
+                     const Operand *Src,
+                     const x86::AssemblerX86::XmmEmitterShiftOp &Emitter);
+
+// Two-address XMM shift instruction of kind K. Dest is recorded as source 0
+// and the shift count as source 1. Opcode and Emitter are supplied per
+// instantiation.
+template <InstX8632::InstKindX8632 K>
+class InstX8632BinopXmmShift : public InstX8632 {
+public:
+  // Builds the instruction in storage obtained from Func's allocator.
+  static InstX8632BinopXmmShift *create(Cfg *Func, Variable *Dest,
+                                        Operand *Source) {
+    return new (Func->allocate<InstX8632BinopXmmShift>())
+        InstX8632BinopXmmShift(Func, Dest, Source);
+  }
+  // Textual emission; XMM shifts pass ShiftHack=false to emitTwoAddress.
+  void emit(const Cfg *Func) const override {
+    emitTwoAddress(Opcode, this, Func, /*ShiftHack=*/false);
+  }
+  // Integrated-assembler emission. The assembler receives the element type
+  // of the destination vector rather than the vector type itself.
+  // NOTE(review): v8i1/v4i1 are accepted here, but the psll/psra assembler
+  // routines only accept i16/i32 element types -- confirm what
+  // typeElementType() returns for the i1 vectors.
+  void emitIAS(const Cfg *Func) const override {
+    const Variable *Dest = getDest();
+    const Type DestTy = Dest->getType();
+    assert(DestTy == IceType_v8i16 || DestTy == IceType_v8i1 ||
+           DestTy == IceType_v4i32 || DestTy == IceType_v4i1);
+    assert(getSrcSize() == 2);
+    emitIASXmmShift(Func, typeElementType(DestTy), Dest, getSrc(1), Emitter);
+  }
+  // Dumps "<dest> = <opcode>.<type> <sources>" to the dump stream.
+  void dump(const Cfg *Func) const override {
+    Ostream &Out = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Out << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  // Registers Dest as source 0 and the shift count as source 1.
+  InstX8632BinopXmmShift(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstX8632(Func, K, 2, Dest) {
+    addSource(Dest);
+    addSource(Source);
+  }
+  InstX8632BinopXmmShift(const InstX8632BinopXmmShift &) = delete;
+  InstX8632BinopXmmShift &operator=(const InstX8632BinopXmmShift &) = delete;
+  ~InstX8632BinopXmmShift() override {}
+  // Mnemonic used by emit() and dump().
+  static const char *Opcode;
+  // Assembler entry points used by emitIAS().
+  static const x86::AssemblerX86::XmmEmitterShiftOp Emitter;
+};
+
 template <InstX8632::InstKindX8632 K> class InstX8632Ternop : public InstX8632 {
 public:
  // Create a ternary-op instruction like div or idiv.
@@ -834,16 +923,16 @@ typedef InstX8632BinopXmm<InstX8632::Pxor, false> InstX8632Pxor;
 typedef InstX8632BinopGPR<InstX8632::Imul> InstX8632Imul;
 typedef InstX8632BinopXmm<InstX8632::Mulps, true> InstX8632Mulps;
 typedef InstX8632BinopXmm<InstX8632::Mulss, false> InstX8632Mulss;
-typedef InstX8632Binop<InstX8632::Pmull> InstX8632Pmull;
+typedef InstX8632BinopXmm<InstX8632::Pmull, true> InstX8632Pmull;
 typedef InstX8632BinopXmm<InstX8632::Pmuludq, false> InstX8632Pmuludq;
 typedef InstX8632BinopXmm<InstX8632::Divps, true> InstX8632Divps;
 typedef InstX8632BinopXmm<InstX8632::Divss, false> InstX8632Divss;
-typedef InstX8632Binop<InstX8632::Rol, true> InstX8632Rol;
-typedef InstX8632Binop<InstX8632::Shl, true> InstX8632Shl;
-typedef InstX8632Binop<InstX8632::Psll> InstX8632Psll;
-typedef InstX8632Binop<InstX8632::Shr, true> InstX8632Shr;
-typedef InstX8632Binop<InstX8632::Sar, true> InstX8632Sar;
-typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra;
+typedef InstX8632BinopGPRShift<InstX8632::Rol> InstX8632Rol;
+typedef InstX8632BinopGPRShift<InstX8632::Shl> InstX8632Shl;
+typedef InstX8632BinopXmmShift<InstX8632::Psll> InstX8632Psll;
+typedef InstX8632BinopGPRShift<InstX8632::Shr> InstX8632Shr;
+typedef InstX8632BinopGPRShift<InstX8632::Sar> InstX8632Sar;
+typedef InstX8632BinopXmmShift<InstX8632::Psra> InstX8632Psra;
 typedef InstX8632BinopXmm<InstX8632::Pcmpeq, true> InstX8632Pcmpeq;
 typedef InstX8632BinopXmm<InstX8632::Pcmpgt, true> InstX8632Pcmpgt;
 // TODO: movss is only a binary operation when the source and dest
@@ -1446,6 +1535,7 @@ template <> void InstX8632Idiv::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Imul::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Cbwdq::emitIAS(const Cfg *Func) const;
 template <> void InstX8632Movd::emitIAS(const Cfg *Func) const;
+template <> void InstX8632Pmull::emitIAS(const Cfg *Func) const;

 } // end of namespace Ice


--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -537,6 +537,34 @@ void AssemblerX86::pandn(Type /* Ty */, XmmRegister dst, const Address &src) {
  EmitOperand(dst, src);
 }

+// Packed multiply-low, register source. Ty is the element type:
+//   i16 -> pmullw: 66 0F D5 /r
+//   i32 -> pmulld: 66 0F 38 40 /r
+void AssemblerX86::pmull(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  const bool IsWordElement = Ty == IceType_i16;
+  if (!IsWordElement) {
+    // Anything that is not i16 must be i32.
+    assert(Ty == IceType_i32);
+    EmitUint8(0x38);
+    EmitUint8(0x40);
+  } else {
+    EmitUint8(0xD5);
+  }
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// Packed multiply-low, memory source. Ty is the element type:
+//   i16 -> pmullw: 66 0F D5 /r
+//   i32 -> pmulld: 66 0F 38 40 /r
+void AssemblerX86::pmull(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  const bool IsWordElement = Ty == IceType_i16;
+  if (!IsWordElement) {
+    // Anything that is not i16 must be i32.
+    assert(Ty == IceType_i32);
+    EmitUint8(0x38);
+    EmitUint8(0x40);
+  } else {
+    EmitUint8(0xD5);
+  }
+  EmitOperand(dst, src);
+}
+
 void AssemblerX86::pmuludq(Type /* Ty */, XmmRegister dst, XmmRegister src) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  EmitUint8(0x66);
@@ -613,6 +641,88 @@ void AssemblerX86::pxor(Type /* Ty */, XmmRegister dst, const Address &src) {
  EmitOperand(dst, src);
 }

+// Packed shift-left-logical, count in an XMM register. Ty is the element
+// type: i16 -> psllw (66 0F F1 /r); i32 -> pslld (66 0F F2 /r).
+void AssemblerX86::psll(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  // Only i16 and i32 elements are supported.
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  EmitUint8(Ty == IceType_i16 ? 0xF1 : 0xF2);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// Packed shift-left-logical, count read from memory. Ty is the element
+// type: i16 -> psllw (66 0F F1 /r); i32 -> pslld (66 0F F2 /r).
+void AssemblerX86::psll(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  // Only i16 and i32 elements are supported.
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  EmitUint8(Ty == IceType_i16 ? 0xF1 : 0xF2);
+  EmitOperand(dst, src);
+}
+
+// Packed shift-left-logical by an immediate count. Ty is the element type:
+// i16 -> psllw (66 0F 71 /6 ib); i32 -> pslld (66 0F 72 /6 ib).
+void AssemblerX86::psll(Type Ty, XmmRegister dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(imm.is_int8());
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  // Only i16 and i32 elements are supported.
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  EmitUint8(Ty == IceType_i16 ? 0x71 : 0x72);
+  // 6 is the /6 opcode extension that selects the left shift.
+  EmitRegisterOperand(6, dst);
+  // Only the low byte of the immediate is encoded.
+  EmitUint8(imm.value() & 0xFF);
+}
+
+// Packed shift-right-arithmetic, count in an XMM register. Ty is the
+// element type: i16 -> psraw (66 0F E1 /r); i32 -> psrad (66 0F E2 /r).
+void AssemblerX86::psra(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  // Only i16 and i32 elements are supported.
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  EmitUint8(Ty == IceType_i16 ? 0xE1 : 0xE2);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+// Packed shift-right-arithmetic, count read from memory. Ty is the element
+// type: i16 -> psraw (66 0F E1 /r); i32 -> psrad (66 0F E2 /r).
+void AssemblerX86::psra(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  // Only i16 and i32 elements are supported.
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  EmitUint8(Ty == IceType_i16 ? 0xE1 : 0xE2);
+  EmitOperand(dst, src);
+}
+
+// Packed shift-right-arithmetic by an immediate count. Ty is the element
+// type: i16 -> psraw (66 0F 71 /4 ib); i32 -> psrad (66 0F 72 /4 ib).
+void AssemblerX86::psra(Type Ty, XmmRegister dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(imm.is_int8());
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  // Only i16 and i32 elements are supported.
+  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  EmitUint8(Ty == IceType_i16 ? 0x71 : 0x72);
+  // 4 is the /4 opcode extension that selects the arithmetic right shift.
+  EmitRegisterOperand(4, dst);
+  // Only the low byte of the immediate is encoded.
+  EmitUint8(imm.value() & 0xFF);
+}
+
 // {add,sub,mul,div}ps are given a Ty parameter for consistency with
 // {add,sub,mul,div}ss. In the future, when the PNaCl ABI allows
 // addpd, etc., we can use the Ty parameter to decide on adding
@@ -1639,36 +1749,52 @@ void AssemblerX86::decl(const Address &address) {
  EmitOperand(1, address);
 }

-void AssemblerX86::shll(GPRRegister reg, const Immediate &imm) {
-  EmitGenericShift(4, reg, imm);
+void AssemblerX86::rol(Type Ty, GPRRegister reg, const Immediate &imm) {
+  EmitGenericShift(0, Ty, reg, imm);
+}
+
+void AssemblerX86::rol(Type Ty, GPRRegister operand, GPRRegister shifter) {
+  EmitGenericShift(0, Ty, Operand(operand), shifter);
+}
+
+void AssemblerX86::rol(Type Ty, const Address &operand, GPRRegister shifter) {
+  EmitGenericShift(0, Ty, operand, shifter);
+}
+
+void AssemblerX86::shl(Type Ty, GPRRegister reg, const Immediate &imm) {
+  EmitGenericShift(4, Ty, reg, imm);
+}
+
+void AssemblerX86::shl(Type Ty, GPRRegister operand, GPRRegister shifter) {
+  EmitGenericShift(4, Ty, Operand(operand), shifter);
 }

-void AssemblerX86::shll(GPRRegister operand, GPRRegister shifter) {
-  EmitGenericShift(4, Operand(operand), shifter);
+void AssemblerX86::shl(Type Ty, const Address &operand, GPRRegister shifter) {
+  EmitGenericShift(4, Ty, operand, shifter);
 }

-void AssemblerX86::shll(const Address &operand, GPRRegister shifter) {
-  EmitGenericShift(4, Operand(operand), shifter);
+void AssemblerX86::shr(Type Ty, GPRRegister reg, const Immediate &imm) {
+  EmitGenericShift(5, Ty, reg, imm);
 }

-void AssemblerX86::shrl(GPRRegister reg, const Immediate &imm) {
-  EmitGenericShift(5, reg, imm);
+void AssemblerX86::shr(Type Ty, GPRRegister operand, GPRRegister shifter) {
+  EmitGenericShift(5, Ty, Operand(operand), shifter);
 }

-void AssemblerX86::shrl(GPRRegister operand, GPRRegister shifter) {
-  EmitGenericShift(5, Operand(operand), shifter);
+void AssemblerX86::shr(Type Ty, const Address &operand, GPRRegister shifter) {
+  EmitGenericShift(5, Ty, operand, shifter);
 }

-void AssemblerX86::sarl(GPRRegister reg, const Immediate &imm) {
-  EmitGenericShift(7, reg, imm);
+void AssemblerX86::sar(Type Ty, GPRRegister reg, const Immediate &imm) {
+  EmitGenericShift(7, Ty, reg, imm);
 }

-void AssemblerX86::sarl(GPRRegister operand, GPRRegister shifter) {
-  EmitGenericShift(7, Operand(operand), shifter);
+void AssemblerX86::sar(Type Ty, GPRRegister operand, GPRRegister shifter) {
+  EmitGenericShift(7, Ty, Operand(operand), shifter);
 }

-void AssemblerX86::sarl(const Address &address, GPRRegister shifter) {
-  EmitGenericShift(7, Operand(address), shifter);
+void AssemblerX86::sar(Type Ty, const Address &address, GPRRegister shifter) {
+  EmitGenericShift(7, Ty, address, shifter);
 }

 void AssemblerX86::shld(GPRRegister dst, GPRRegister src) {
@@ -2129,27 +2255,31 @@ void AssemblerX86::EmitNearLabelLink(Label *label) {
  label->NearLinkTo(position);
 }

-void AssemblerX86::EmitGenericShift(int rm, GPRRegister reg,
+void AssemblerX86::EmitGenericShift(int rm, Type Ty, GPRRegister reg,
                                    const Immediate &imm) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  assert(imm.is_int8());
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
  if (imm.value() == 1) {
-    EmitUint8(0xD1);
+    EmitUint8(isByteSizedArithType(Ty) ? 0xD0 : 0xD1);
    EmitOperand(rm, Operand(reg));
  } else {
-    EmitUint8(0xC1);
+    EmitUint8(isByteSizedArithType(Ty) ? 0xC0 : 0xC1);
    EmitOperand(rm, Operand(reg));
    EmitUint8(imm.value() & 0xFF);
  }
 }

-void AssemblerX86::EmitGenericShift(int rm, const Operand &operand,
+void AssemblerX86::EmitGenericShift(int rm, Type Ty, const Operand &operand,
                                    GPRRegister shifter) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  assert(shifter == RegX8632::Encoded_Reg_ecx);
  (void)shifter;
-  EmitUint8(0xD3);
-  EmitOperand(rm, Operand(operand));
+  if (Ty == IceType_i16)
+    EmitOperandSizeOverride();
+  EmitUint8(isByteSizedArithType(Ty) ? 0xD2 : 0xD3);
+  EmitOperand(rm, operand);
 }

 } // end of namespace x86

--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -354,6 +354,13 @@ public:
    TypedEmitGPRImm GPRImm;
  };

+  struct GPREmitterShiftOp { // Emitters for one GPR shift/rotate opcode.
+    TypedEmitGPRGPR GPRGPR; // Count in a register (asserted to be ecx).
+    TypedEmitGPRImm GPRImm; // Count as an immediate (asserted to fit int8).
+    // Technically, Addr/GPR and Addr/Imm are also allowed, but */Addr are not.
+    // In practice, we always normalize the Dest to a Register first.
+  };
+
  // Operations to emit XMM instructions (and dispatch on operand type).
  typedef void (AssemblerX86::*TypedEmitXmmXmm)(Type, XmmRegister, XmmRegister);
  typedef void (AssemblerX86::*TypedEmitXmmAddr)(Type, XmmRegister,
@@ -366,6 +373,15 @@ public:
    TypedEmitAddrXmm AddrXmm;
  };

+  typedef void (AssemblerX86::*TypedEmitXmmImm)(Type, XmmRegister,
+                                                const Immediate &); // Xmm, imm
+
+  struct XmmEmitterShiftOp { // Emitters for one XMM shift opcode.
+    TypedEmitXmmXmm XmmXmm;   // Count in an XMM register.
+    TypedEmitXmmAddr XmmAddr; // Count read from a memory operand.
+    TypedEmitXmmImm XmmImm;   // Count given as an immediate.
+  };
+
  /*
   * Emit Machine Instructions.
   */
@@ -453,6 +469,8 @@ public:
  void pand(Type Ty, XmmRegister dst, const Address &src);
  void pandn(Type Ty, XmmRegister dst, XmmRegister src);
  void pandn(Type Ty, XmmRegister dst, const Address &src);
+  void pmull(Type Ty, XmmRegister dst, XmmRegister src);
+  void pmull(Type Ty, XmmRegister dst, const Address &src);
  void pmuludq(Type Ty, XmmRegister dst, XmmRegister src);
  void pmuludq(Type Ty, XmmRegister dst, const Address &src);
  void por(Type Ty, XmmRegister dst, XmmRegister src);
@@ -462,6 +480,14 @@ public:
  void pxor(Type Ty, XmmRegister dst, XmmRegister src);
  void pxor(Type Ty, XmmRegister dst, const Address &src);

+  // Packed shift left logical of dst. Ty is the vector element type (i16 or
+  // i32); the count comes from an XMM register, memory, or an immediate.
+  void psll(Type Ty, XmmRegister dst, XmmRegister src);
+  void psll(Type Ty, XmmRegister dst, const Address &src);
+  void psll(Type Ty, XmmRegister dst, const Immediate &imm);
+
+  // Packed shift right arithmetic of dst; same operand forms as psll.
+  void psra(Type Ty, XmmRegister dst, XmmRegister src);
+  void psra(Type Ty, XmmRegister dst, const Address &src);
+  void psra(Type Ty, XmmRegister dst, const Immediate &imm);
+
+
  void addps(Type Ty, XmmRegister dst, XmmRegister src);
  void addps(Type Ty, XmmRegister dst, const Address &src);
  void subps(Type Ty, XmmRegister dst, XmmRegister src);
@@ -629,14 +655,22 @@ public:
  void decl(GPRRegister reg);
  void decl(const Address &address);

-  void shll(GPRRegister reg, const Immediate &imm);
-  void shll(GPRRegister operand, GPRRegister shifter);
-  void shll(const Address &operand, GPRRegister shifter);
-  void shrl(GPRRegister reg, const Immediate &imm);
-  void shrl(GPRRegister operand, GPRRegister shifter);
-  void sarl(GPRRegister reg, const Immediate &imm);
-  void sarl(GPRRegister operand, GPRRegister shifter);
-  void sarl(const Address &address, GPRRegister shifter);
+  void rol(Type Ty, GPRRegister reg, const Immediate &imm);
+  void rol(Type Ty, GPRRegister operand, GPRRegister shifter);
+  void rol(Type Ty, const Address &operand, GPRRegister shifter);
+
+  void shl(Type Ty, GPRRegister reg, const Immediate &imm);
+  void shl(Type Ty, GPRRegister operand, GPRRegister shifter);
+  void shl(Type Ty, const Address &operand, GPRRegister shifter);
+
+  void shr(Type Ty, GPRRegister reg, const Immediate &imm);
+  void shr(Type Ty, GPRRegister operand, GPRRegister shifter);
+  void shr(Type Ty, const Address &operand, GPRRegister shifter);
+
+  void sar(Type Ty, GPRRegister reg, const Immediate &imm);
+  void sar(Type Ty, GPRRegister operand, GPRRegister shifter);
+  void sar(Type Ty, const Address &address, GPRRegister shifter);
+
  void shld(GPRRegister dst, GPRRegister src);
  void shld(GPRRegister dst, GPRRegister src, const Immediate &imm);
  void shld(const Address &operand, GPRRegister src);
@@ -721,8 +755,9 @@ private:
  void EmitLabelLink(Label *label);
  void EmitNearLabelLink(Label *label);

-  void EmitGenericShift(int rm, GPRRegister reg, const Immediate &imm);
-  void EmitGenericShift(int rm, const Operand &operand, GPRRegister shifter);
+  void EmitGenericShift(int rm, Type Ty, GPRRegister reg, const Immediate &imm);
+  void EmitGenericShift(int rm, Type Ty, const Operand &operand,
+                        GPRRegister shifter);

  AssemblerBuffer buffer_;


--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -254,5 +254,27 @@ entry:
 ; CHECK-LABEL: testMul32Imm16Neg
 ; CHECK: 69 c0 00 ff ff ff  imul eax, eax, 4294967040

+; The GPR shift instructions either allow an 8-bit immediate or
+; have a special encoding for "1".
+define internal i32 @testShl16Imm8(i32 %arg) {
+entry:
+  %arg_i16 = trunc i32 %arg to i16
+  %tmp = shl i16 %arg_i16, 13
+  %result = zext i16 %tmp to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testShl16Imm8
+; CHECK: 66 c1 e0 0d shl ax, 13
+
+define internal i32 @testShl16Imm1(i32 %arg) {
+entry:
+  %arg_i16 = trunc i32 %arg to i16
+  %tmp = shl i16 %arg_i16, 1
+  %result = zext i16 %tmp to i32
+  ret i32 %result
+}
+; CHECK-LABEL: testShl16Imm1
+; CHECK: 66 d1 e0 shl ax
+
 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ
--- a/tests_lit/assembler/x86/opcode_register_encodings.ll
+++ b/tests_lit/assembler/x86/opcode_register_encodings.ll
+; Tests various aspects of x86 opcode encodings. E.g., the opcodes for some
+; instructions, such as pmull, vary more wildly with operand size (rather
+; than following a usual pattern).
+
+; RUN: %p2i -i %s --args -O2 -mattr=sse4.1 --verbose none \
+; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
+; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
+
+define <8 x i16> @test_mul_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
+entry:
+  %res = mul <8 x i16> %arg0, %arg1
+  ret <8 x i16> %res
+; CHECK-LABEL: test_mul_v8i16
+; CHECK: 66 0f d5 c1 pmullw xmm0, xmm1
+}
+
+; Test register and address mode encoding.
+define <8 x i16> @test_mul_v8i16_more_regs(<8 x i1> %cond, <8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2, <8 x i16> %arg3, <8 x i16> %arg4, <8 x i16> %arg5, <8 x i16> %arg6, <8 x i16> %arg7, <8 x i16> %arg8) {
+entry:
+  %res1 = mul <8 x i16> %arg0, %arg1
+  %res2 = mul <8 x i16> %arg0, %arg2
+  %res3 = mul <8 x i16> %arg0, %arg3
+  %res4 = mul <8 x i16> %arg0, %arg4
+  %res5 = mul <8 x i16> %arg0, %arg5
+  %res6 = mul <8 x i16> %arg0, %arg6
+  %res7 = mul <8 x i16> %arg0, %arg7
+  %res8 = mul <8 x i16> %arg0, %arg8
+  %res_acc1 = select <8 x i1> %cond, <8 x i16> %res1, <8 x i16> %res2
+  %res_acc2 = select <8 x i1> %cond, <8 x i16> %res3, <8 x i16> %res4
+  %res_acc3 = select <8 x i1> %cond, <8 x i16> %res5, <8 x i16> %res6
+  %res_acc4 = select <8 x i1> %cond, <8 x i16> %res7, <8 x i16> %res8
+  %res_acc1_3 = select <8 x i1> %cond, <8 x i16> %res_acc1, <8 x i16> %res_acc3
+  %res_acc2_4 = select <8 x i1> %cond, <8 x i16> %res_acc2, <8 x i16> %res_acc4
+  %res = select <8 x i1> %cond, <8 x i16> %res_acc1_3, <8 x i16> %res_acc2_4
+  ret <8 x i16> %res
+; CHECK-LABEL: test_mul_v8i16_more_regs
+; CHECK-DAG: 66 0f d5 c2 pmullw xmm0, xmm2
+; CHECK-DAG: 66 0f d5 c3 pmullw xmm0, xmm3
+; CHECK-DAG: 66 0f d5 c4 pmullw xmm0, xmm4
+; CHECK-DAG: 66 0f d5 c5 pmullw xmm0, xmm5
+; CHECK-DAG: 66 0f d5 c6 pmullw xmm0, xmm6
+; CHECK-DAG: 66 0f d5 c7 pmullw xmm0, xmm7
+; CHECK-DAG: 66 0f d5 44 24 70 pmullw xmm0, xmmword ptr [esp + 112]
+; CHECK-DAG: 66 0f d5 8c 24 80 00 00 00 pmullw xmm1, xmmword ptr [esp + 128]
+}
+
+define <4 x i32> @test_mul_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
+entry:
+  %res = mul <4 x i32> %arg0, %arg1
+  ret <4 x i32> %res
+; CHECK-LABEL: test_mul_v4i32
+; CHECK: 66 0f 38 40 c1  pmulld  xmm0, xmm1
+}
+
+define <4 x i32> @test_mul_v4i32_more_regs(<4 x i1> %cond, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, <4 x i32> %arg4, <4 x i32> %arg5, <4 x i32> %arg6, <4 x i32> %arg7, <4 x i32> %arg8) {
+entry:
+  %res1 = mul <4 x i32> %arg0, %arg1
+  %res2 = mul <4 x i32> %arg0, %arg2
+  %res3 = mul <4 x i32> %arg0, %arg3
+  %res4 = mul <4 x i32> %arg0, %arg4
+  %res5 = mul <4 x i32> %arg0, %arg5
+  %res6 = mul <4 x i32> %arg0, %arg6
+  %res7 = mul <4 x i32> %arg0, %arg7
+  %res8 = mul <4 x i32> %arg0, %arg8
+  %res_acc1 = select <4 x i1> %cond, <4 x i32> %res1, <4 x i32> %res2
+  %res_acc2 = select <4 x i1> %cond, <4 x i32> %res3, <4 x i32> %res4
+  %res_acc3 = select <4 x i1> %cond, <4 x i32> %res5, <4 x i32> %res6
+  %res_acc4 = select <4 x i1> %cond, <4 x i32> %res7, <4 x i32> %res8
+  %res_acc1_3 = select <4 x i1> %cond, <4 x i32> %res_acc1, <4 x i32> %res_acc3
+  %res_acc2_4 = select <4 x i1> %cond, <4 x i32> %res_acc2, <4 x i32> %res_acc4
+  %res = select <4 x i1> %cond, <4 x i32> %res_acc1_3, <4 x i32> %res_acc2_4
+  ret <4 x i32> %res
+; CHECK-LABEL: test_mul_v4i32_more_regs
+; CHECK-DAG: 66 0f 38 40 c2 pmulld xmm0, xmm2
+; CHECK-DAG: 66 0f 38 40 c3 pmulld xmm0, xmm3
+; CHECK-DAG: 66 0f 38 40 c4 pmulld xmm0, xmm4
+; CHECK-DAG: 66 0f 38 40 c5 pmulld xmm0, xmm5
+; CHECK-DAG: 66 0f 38 40 c6 pmulld xmm0, xmm6
+; CHECK-DAG: 66 0f 38 40 c7 pmulld xmm0, xmm7
+; CHECK-DAG: 66 0f 38 40 44 24 70 pmulld xmm0, xmmword ptr [esp + 112]
+; CHECK-DAG: 66 0f 38 40 8c 24 80 00 00 00 pmulld xmm1, xmmword ptr [esp + 128]
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
--- a/tests_lit/llvm2ice_tests/8bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -159,6 +159,69 @@ entry:
 ; CHECK-LABEL: srem8BitConst
 ; CHECK: idiv {{[abcd]l|byte ptr}}

+define internal i32 @shl8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %shl = shl i8 %b_8, %a_8
+  %ret = zext i8 %shl to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: shl8Bit
+; CHECK: shl {{[abd]l|byte ptr}}, cl
+
+define internal i32 @shl8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %shl = shl i8 %a_8, 6
+  %ret = zext i8 %shl to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: shl8BitConst
+; CHECK: shl {{[abcd]l|byte ptr}}, 6
+
+define internal i32 @lshr8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %lshr = lshr i8 %b_8, %a_8
+  %ret = zext i8 %lshr to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: lshr8Bit
+; CHECK: shr {{[abd]l|byte ptr}}, cl
+
+define internal i32 @lshr8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %lshr = lshr i8 %a_8, 6
+  %ret = zext i8 %lshr to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: lshr8BitConst
+; CHECK: shr {{[abcd]l|byte ptr}}, 6
+
+define internal i32 @ashr8Bit(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %b_8 = trunc i32 %b to i8
+  %ashr = ashr i8 %b_8, %a_8
+  %ret = zext i8 %ashr to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: ashr8Bit
+; CHECK: sar {{[abd]l|byte ptr}}, cl
+
+define internal i32 @ashr8BitConst(i32 %a, i32 %b) {
+entry:
+  %a_8 = trunc i32 %a to i8
+  %ashr = ashr i8 %a_8, 6
+  %ret = zext i8 %ashr to i32
+  ret i32 %ret
+}
+; CHECK-LABEL: ashr8BitConst
+; CHECK: sar {{[abcd]l|byte ptr}}, 6
+

 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ
--- a/tests_lit/llvm2ice_tests/address-mode-opt.ll
+++ b/tests_lit/llvm2ice_tests/address-mode-opt.ll
@@ -3,6 +3,10 @@
 ; RUN: %p2i -i %s --args -O2 --verbose none \
 ; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
 ; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args -O2 -mattr=sse4.1 --verbose none \
+; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
+; RUN:   | FileCheck --check-prefix=SSE41 %s
 ; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s

 define float @load_arg_plus_200000(float* %arg) {
@@ -49,6 +53,32 @@ entry:
 ; CHECK: movss xmm0, dword ptr [e{{..}}]
 }

+define <8 x i16> @load_mul_v8i16_mem(<8 x i16> %arg0, i32 %arg1_iptr) {
+entry:
+  %addr_sub = sub i32 %arg1_iptr, 200000
+  %addr_ptr = inttoptr i32 %addr_sub to <8 x i16>*
+  %arg1 = load <8 x i16>* %addr_ptr, align 2
+  %res_vec = mul <8 x i16> %arg0, %arg1
+  ret <8 x i16> %res_vec
+; CHECK-LABEL: load_mul_v8i16_mem:
+; CHECK: pmullw xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+}
+
+define <4 x i32> @load_mul_v4i32_mem(<4 x i32> %arg0, i32 %arg1_iptr) {
+entry:
+  %addr_sub = sub i32 %arg1_iptr, 200000
+  %addr_ptr = inttoptr i32 %addr_sub to <4 x i32>*
+  %arg1 = load <4 x i32>* %addr_ptr, align 4
+  %res = mul <4 x i32> %arg0, %arg1
+  ret <4 x i32> %res
+; CHECK-LABEL: load_mul_v4i32_mem:
+; CHECK: pmuludq xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+; CHECK: pmuludq
+;
+; SSE41-LABEL: load_mul_v4i32_mem:
+; SSE41: pmulld xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+}
+
 define float @address_mode_opt_chaining(float* %arg) {
 entry:
  %arg.int = ptrtoint float* %arg to i32

--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -326,7 +326,9 @@ entry:
  ret i32 %r_zext
 }
 ; CHECK-LABEL: test_bswap_16
-; CHECK: rol {{.*}}, 8
+; Make sure this uses the 16-bit operand size, so that bits rotated out of
+; the most significant position wrap around at the 16-bit boundary.
+; CHECK: rol {{[abcd]x|si|di|bp|word ptr}}, 8

 define i32 @test_bswap_32(i32 %x) {
 entry:

--- a/tests_lit/llvm2ice_tests/vector-arith.ll
+++ b/tests_lit/llvm2ice_tests/vector-arith.ll
@@ -21,7 +21,6 @@
 ; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
 ; RUN:   | FileCheck --check-prefix=SSE41 %s
 ; RUN: %p2i -i %s -a --verbose none | FileCheck --check-prefix=ERRORS %s
-; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s

 define <4 x float> @test_fadd(<4 x float> %arg0, <4 x float> %arg1) {
 entry: