Lower insertelement and extractelement.

Use instructions that do the operations in registers and that are available in SSE2. Spill to memory to perform the operation in the absence of any other reasonable options (v16i8 and v16i1).

Unfortunately there is no natural class of SSE2 instructions that insertelement / extractelement can be lowered to for all vector types (though pinsr[bwd] and pextr[bwd] are available in SSE4.1). In some cases there are a large number of choices available for lowering, and I have not yet looked into which choices are best, beyond using LLVM output as a guide.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/401523003

Lower insertelement and extractelement.
49889239 · Matt Wala · 7fa22d8a · 49889239 · 49889239 · 49889239
Commit 49889239 authored Jul 18, 2014 by Matt Wala
15 changed files
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -82,6 +82,13 @@ for optlevel in ${OPTLEVELS} ; do
       --driver=test_sync_atomic_main.cpp \
       --output=test_sync_atomic_O${optlevel}

+    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+        --dir="${OUTDIR}" \
+        --llvm-bin-path="${LLVM_BIN_PATH}" \
+        --test=test_vector_ops.ll \
+        --driver=test_vector_ops_main.cpp \
+        --output=test_vector_ops_O${optlevel}
+
 done

 for optlevel in ${OPTLEVELS} ; do
@@ -94,4 +101,5 @@ for optlevel in ${OPTLEVELS} ; do
    "${OUTDIR}"/test_global_O${optlevel}
    "${OUTDIR}"/test_icmp_O${optlevel}
    "${OUTDIR}"/test_sync_atomic_O${optlevel}
+    "${OUTDIR}"/test_vector_ops_O${optlevel}
 done
--- a/crosstest/test_vector_ops.def
+++ b/crosstest/test_vector_ops.def
+#ifndef TEST_VECTOR_OPS_DEF
+#define TEST_VECTOR_OPS_DEF
+
+// X-macro table of the native 128-bit vector types covered by the test.
+// Each row gives the vector typename, its element type, and the scalar
+// "cast type" used when a single element is passed to or returned from the
+// functions under test.  Define X(typename, elty, castty) before expanding
+// the table and #undef it afterwards.
+#define VECTOR_TYPE_TABLE                 \
+/* typename, element type,  cast type */  \
+X(v16si8,          int8_t,  int64_t)      \
+X(v16ui8,         uint8_t,  int64_t)      \
+X(v8si16,         int16_t,  int64_t)      \
+X(v8ui16,        uint16_t,  int64_t)      \
+X(v4si32,         int32_t,  int64_t)      \
+X(v4ui32,        uint32_t,  int64_t)      \
+X(v4f32,            float,    float)      \
+
+// X-macro table of the i1 vector types.  Each row gives the typename, the
+// native vector type from VECTOR_TYPE_TABLE that it is expanded to, and the
+// number of elements.  Define X(typename, expandedty, num_elements) before
+// expanding the table and #undef it afterwards.
+#define I1_VECTOR_TYPE_TABLE              \
+/* typename, expanded type, # elements */ \
+X(v4i1,             v4ui32,          4)   \
+X(v8i1,             v8ui16,          8)   \
+X(v16i1,            v16ui8,         16)   \
+
+#endif
--- a/crosstest/test_vector_ops.ll
+++ b/crosstest/test_vector_ops.ll
--- a/crosstest/test_vector_ops_main.cpp
+++ b/crosstest/test_vector_ops_main.cpp
+/* crosstest.py --test=test_vector_ops.ll  --driver=test_vector_ops_main.cpp \
+   --prefix=Subzero_ --output=test_vector_ops */
+
+#include <stdint.h>
+#include <cstring>
+#include <sstream>
+#include <iostream>
+#include <limits>
+#include <utility>
+#include <vector>
+#include <stdlib.h>
+
+#include "test_vector_ops.def"
+
+// typedefs of native C++ SIMD vector types
+// Each row of VECTOR_TYPE_TABLE becomes a 16-byte vector typedef of its
+// element type.
+#define X(ty, elty, castty) typedef elty ty __attribute__((vector_size(16)));
+VECTOR_TYPE_TABLE
+#undef X
+
+// i1 vector types are not native C++ SIMD vector types. Instead, they
+// are expanded by the test code into native 128 bit SIMD vector types
+// with the appropriate number of elements. Representing the types in
+// VectorOps<> requires a unique name for each type which this
+// declaration provides.
+#define X(ty, expandedty, num_elements)                                        \
+  class ty;
+I1_VECTOR_TYPE_TABLE
+#undef X
+
+// Per-type traits and function pointers for the operations under test.
+// Only the specializations generated by DECLARE_VECTOR_OPS are defined.
+template <typename T> struct VectorOps;
+
+// Defines the VectorOps<TYNAME> specialization:
+//   Ty        - the vector type actually passed to the tested functions (TY)
+//   ElementTy - the logical element type (ELTY)
+//   CastTy    - the scalar type used for the element argument of
+//               insertelement and the result of extractelement (CASTTY)
+// It also declares the extern "C" functions insertelement_TYNAME,
+// extractelement_TYNAME and their Subzero_-prefixed counterparts, and
+// initializes the static function pointers, NumElements and TypeName
+// (the stringified TYNAME) to refer to them.
+#define DECLARE_VECTOR_OPS(TYNAME, TY, ELTY, CASTTY, NUM_ELEMENTS)             \
+  template <> struct VectorOps<TYNAME> {                                       \
+    typedef TY Ty;                                                             \
+    typedef ELTY ElementTy;                                                    \
+    typedef CASTTY CastTy;                                                     \
+    static TY (*insertelement)(TY, CASTTY, int32_t);                           \
+    static TY (*Subzero_insertelement)(TY, CASTTY, int32_t);                   \
+    static CASTTY (*extractelement)(TY, int32_t);                              \
+    static CASTTY (*Subzero_extractelement)(TY, int32_t);                      \
+    static size_t NumElements;                                                 \
+    static const char *TypeName;                                               \
+  };                                                                           \
+  extern "C" TY insertelement_##TYNAME(TY, CASTTY, int32_t);                   \
+  extern "C" TY Subzero_insertelement_##TYNAME(TY, CASTTY, int32_t);           \
+  extern "C" CASTTY extractelement_##TYNAME(TY, int32_t);                      \
+  extern "C" CASTTY Subzero_extractelement_##TYNAME(TY, int32_t);              \
+  size_t VectorOps<TYNAME>::NumElements = NUM_ELEMENTS;                        \
+  TY (*VectorOps<TYNAME>::insertelement)(TY, CASTTY, int32_t) =                \
+      &insertelement_##TYNAME;                                                 \
+  TY (*VectorOps<TYNAME>::Subzero_insertelement)(TY, CASTTY, int32_t) =        \
+      &Subzero_insertelement_##TYNAME;                                         \
+  CASTTY (*VectorOps<TYNAME>::extractelement)(TY, int32_t) =                   \
+      &extractelement_##TYNAME;                                                \
+  CASTTY (*VectorOps<TYNAME>::Subzero_extractelement)(TY, int32_t) =           \
+      &Subzero_extractelement_##TYNAME;                                        \
+  const char *VectorOps<TYNAME>::TypeName = #TYNAME;
+
+// Native vector types: the vector type is its own storage type, and the
+// element count is derived from the vector and element sizes.
+#define X(ty, elty, castty)                                                    \
+  DECLARE_VECTOR_OPS(ty, ty, elty, castty, (sizeof(ty) / sizeof(elty)))
+VECTOR_TYPE_TABLE
+#undef X
+
+// i1 vector types: stored in the expanded native vector type, with bool as
+// the logical element type and int64_t as the cast type.
+#define X(ty, expandedty, num_elements)                                        \
+  DECLARE_VECTOR_OPS(ty, expandedty, bool, int64_t, num_elements)
+I1_VECTOR_TYPE_TABLE
+#undef X
+
+// Formats the elements of Vect as a single space-separated string.  Each
+// element is converted to VectorOps<T>::CastTy before it is streamed.
+template <typename T>
+std::string vectAsString(const typename VectorOps<T>::Ty Vect) {
+  typedef typename VectorOps<T>::CastTy CastTy;
+  std::ostringstream Buffer;
+  // Empty before the first element, a single space before every later one.
+  const char *Separator = "";
+  for (size_t Idx = 0; Idx != VectorOps<T>::NumElements; ++Idx) {
+    Buffer << Separator << (CastTy)Vect[Idx];
+    Separator = " ";
+  }
+  return Buffer.str();
+}
+
+// Builds the input vectors shared by the tests for vector type T: all zero
+// bits, the element index (0, 1, 2, ...), the negated element index
+// (0, -1, -2, ...), and every element set to
+// std::numeric_limits<ElementTy>::min() and ::max() respectively.
+//
+// The number of vectors is stored in NumTestVectors.  The returned array is
+// allocated with posix_memalign() on a 16-byte boundary and must be released
+// by the caller with free().  Aborts the process if the allocation fails.
+template <typename T>
+typename VectorOps<T>::Ty *getTestVectors(size_t &NumTestVectors) {
+  typedef typename VectorOps<T>::Ty Ty;
+  typedef typename VectorOps<T>::ElementTy ElementTy;
+  const size_t NumElements = VectorOps<T>::NumElements;
+
+  Ty Zero;
+  memset(&Zero, 0, sizeof(Zero));
+
+  Ty Incr;
+  Ty Decr;
+  Ty Min;
+  Ty Max;
+  // The loop index is a size_t so that it has the same type as NumElements.
+  // The element value is taken from a signed int so that negating it yields
+  // a negative number rather than a wrapped unsigned one.
+  for (size_t I = 0; I < NumElements; ++I) {
+    const int Value = static_cast<int>(I);
+    // Note: The casts in the next two assignments are necessary, since
+    // ElementTy isn't necessarily the type that the value is stored in
+    // the vector.
+    Incr[I] = (ElementTy)Value;
+    Decr[I] = (ElementTy)-Value;
+    Min[I] = std::numeric_limits<ElementTy>::min();
+    Max[I] = std::numeric_limits<ElementTy>::max();
+  }
+  Ty TestVectors[] = {Zero, Incr, Decr, Min, Max};
+
+  NumTestVectors = sizeof(TestVectors) / sizeof(Ty);
+
+  // Copy the vectors into heap storage aligned for 128-bit vector accesses.
+  const size_t VECTOR_ALIGNMENT = 16;
+  void *Dest;
+  if (posix_memalign(&Dest, VECTOR_ALIGNMENT, sizeof(TestVectors))) {
+    std::cerr << "memory allocation error" << std::endl;
+    abort();
+  }
+
+  memcpy(Dest, TestVectors, sizeof(TestVectors));
+
+  return static_cast<Ty *>(Dest);
+}
+
+// Compares insertelement against Subzero_insertelement for vector type T.
+// Every test vector is combined with every test element (0, 1, and the
+// numeric_limits min and max of the element type) at every element position.
+// Results are compared bytewise; TotalTests, Passes and Failures are updated
+// and each mismatch is reported on stdout.
+template <typename T>
+void testInsertElement(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef typename VectorOps<T>::Ty Ty;
+  typedef typename VectorOps<T>::ElementTy ElementTy;
+  typedef typename VectorOps<T>::CastTy CastTy;
+
+  size_t VectorCount;
+  Ty *Vectors = getTestVectors<T>(VectorCount);
+
+  ElementTy Elements[] = {0, 1, std::numeric_limits<ElementTy>::min(),
+                          std::numeric_limits<ElementTy>::max()};
+  const size_t ElementCount = sizeof(Elements) / sizeof(ElementTy);
+
+  for (size_t VecIdx = 0; VecIdx < VectorCount; ++VecIdx) {
+    Ty Input = Vectors[VecIdx];
+    for (size_t EltIdx = 0; EltIdx < ElementCount; ++EltIdx) {
+      ElementTy Value = Elements[EltIdx];
+      for (size_t Pos = 0; Pos < VectorOps<T>::NumElements; ++Pos) {
+        Ty Expected = VectorOps<T>::insertelement(Input, Value, Pos);
+        Ty Actual = VectorOps<T>::Subzero_insertelement(Input, Value, Pos);
+        ++TotalTests;
+        if (memcmp(&Expected, &Actual, sizeof(Expected)) == 0) {
+          ++Passes;
+          continue;
+        }
+        // Mismatch: report the inputs followed by both results.
+        ++Failures;
+        std::cout << "insertelement<" << VectorOps<T>::TypeName << ">(Vect="
+                  << vectAsString<T>(Input) << ", Element=" << (CastTy)Value
+                  << ", Pos=" << Pos << ")" << std::endl;
+        std::cout << "llc=" << vectAsString<T>(Expected) << std::endl;
+        std::cout << "sz =" << vectAsString<T>(Actual) << std::endl;
+      }
+    }
+  }
+
+  free(Vectors);
+}
+
+// Compares extractelement against Subzero_extractelement for vector type T.
+// Every element position of every test vector is extracted with both
+// implementations and the results are compared bytewise.  TotalTests,
+// Passes and Failures are updated and each mismatch is reported on stdout.
+template <typename T>
+void testExtractElement(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef typename VectorOps<T>::Ty Ty;
+  typedef typename VectorOps<T>::CastTy CastTy;
+
+  size_t NumTestVectors;
+  Ty *TestVectors = getTestVectors<T>(NumTestVectors);
+
+  for (size_t VI = 0; VI < NumTestVectors; ++VI) {
+    Ty Vect = TestVectors[VI];
+    for (size_t I = 0; I < VectorOps<T>::NumElements; ++I) {
+      CastTy ResultLlc = VectorOps<T>::extractelement(Vect, I);
+      CastTy ResultSz = VectorOps<T>::Subzero_extractelement(Vect, I);
+      ++TotalTests;
+      // Compare the object representations rather than using operator==.
+      if (!memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "extractelement<" << VectorOps<T>::TypeName << ">(Vect=";
+        std::cout << vectAsString<T>(Vect) << ", Pos=" << I << ")" << std::endl;
+        std::cout << "llc=" << ResultLlc << std::endl;
+        std::cout << "sz =" << ResultSz << std::endl;
+      }
+    }
+  }
+
+  free(TestVectors);
+}
+
+// Runs the insertelement and extractelement comparisons for every vector
+// type, prints a summary line, and returns the number of failures (clamped
+// to 255) as the exit status, so 0 means that every test passed.
+int main(int argc, char *argv[]) {
+  // The command line is not used.
+  (void)argc;
+  (void)argv;
+
+  size_t TotalTests = 0;
+  size_t Passes = 0;
+  size_t Failures = 0;
+
+  testInsertElement<v4i1>(TotalTests, Passes, Failures);
+  testInsertElement<v8i1>(TotalTests, Passes, Failures);
+  testInsertElement<v16i1>(TotalTests, Passes, Failures);
+  testInsertElement<v16si8>(TotalTests, Passes, Failures);
+  testInsertElement<v16ui8>(TotalTests, Passes, Failures);
+  testInsertElement<v8si16>(TotalTests, Passes, Failures);
+  testInsertElement<v8ui16>(TotalTests, Passes, Failures);
+  testInsertElement<v4si32>(TotalTests, Passes, Failures);
+  testInsertElement<v4ui32>(TotalTests, Passes, Failures);
+  testInsertElement<v4f32>(TotalTests, Passes, Failures);
+
+  testExtractElement<v4i1>(TotalTests, Passes, Failures);
+  testExtractElement<v8i1>(TotalTests, Passes, Failures);
+  testExtractElement<v16i1>(TotalTests, Passes, Failures);
+  testExtractElement<v16si8>(TotalTests, Passes, Failures);
+  testExtractElement<v16ui8>(TotalTests, Passes, Failures);
+  testExtractElement<v8si16>(TotalTests, Passes, Failures);
+  testExtractElement<v8ui16>(TotalTests, Passes, Failures);
+  testExtractElement<v4si32>(TotalTests, Passes, Failures);
+  testExtractElement<v4ui32>(TotalTests, Passes, Failures);
+  testExtractElement<v4f32>(TotalTests, Passes, Failures);
+
+  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+            << " Failures=" << Failures << "\n";
+
+  // Only the low 8 bits of the exit status reach the parent process, so
+  // clamp the failure count; otherwise a failure count that is a multiple
+  // of 256 would be reported as success.
+  const size_t MaxExitStatus = 255;
+  return static_cast<int>(Failures < MaxExitStatus ? Failures
+                                                   : MaxExitStatus);
+}
+
+// Reports that an "unreachable" instruction was executed and aborts the
+// process.  Declared with C linkage; presumably it is called from the
+// compiled test code rather than from this driver -- TODO confirm.
+extern "C" void ice_unreachable(void) {
+  std::cerr << "\"unreachable\" instruction encountered" << std::endl;
+  abort();
+}
--- a/src/IceConverter.cpp
+++ b/src/IceConverter.cpp
@@ -337,6 +337,10 @@ private:
      return convertArithInstruction(Inst, Ice::InstArithmetic::Or);
    case Instruction::Xor:
      return convertArithInstruction(Inst, Ice::InstArithmetic::Xor);
+    case Instruction::ExtractElement:
+      return convertExtractElementInstruction(cast<ExtractElementInst>(Inst));
+    case Instruction::InsertElement:
+      return convertInsertElementInstruction(cast<InsertElementInst>(Inst));
    case Instruction::Call:
      return convertCallInstruction(cast<CallInst>(Inst));
    case Instruction::Alloca:
@@ -534,6 +538,22 @@ private:
    return Ice::InstFcmp::create(Func, Cond, Dest, Src0, Src1);
  }

+  // Translates an LLVM extractelement instruction into an
+  // Ice::InstExtractElement.  LLVM operands 0 and 1 become the first and
+  // second ICE source operands, in that order.
+  Ice::Inst *convertExtractElementInstruction(const ExtractElementInst *Inst) {
+    // The destination is mapped before the operands are converted.
+    Ice::Variable *Result = mapValueToIceVar(Inst);
+    Ice::Operand *Operand0 = convertValue(Inst->getOperand(0));
+    Ice::Operand *Operand1 = convertValue(Inst->getOperand(1));
+    return Ice::InstExtractElement::create(Func, Result, Operand0, Operand1);
+  }
+
+  // Translates an LLVM insertelement instruction into an
+  // Ice::InstInsertElement.  LLVM operands 0, 1 and 2 become the first,
+  // second and third ICE source operands, in that order.
+  Ice::Inst *convertInsertElementInstruction(const InsertElementInst *Inst) {
+    // The destination is mapped before the operands are converted.
+    Ice::Variable *Result = mapValueToIceVar(Inst);
+    Ice::Operand *Operand0 = convertValue(Inst->getOperand(0));
+    Ice::Operand *Operand1 = convertValue(Inst->getOperand(1));
+    Ice::Operand *Operand2 = convertValue(Inst->getOperand(2));
+    return Ice::InstInsertElement::create(Func, Result, Operand0, Operand1,
+                                          Operand2);
+  }
+
  Ice::Inst *convertSelectInstruction(const SelectInst *Inst) {
    Ice::Variable *Dest = mapValueToIceVar(Inst);
    Ice::Operand *Cond = convertValue(Inst->getCondition());

--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -267,6 +267,13 @@ InstCast::InstCast(Cfg *Func, OpKind CastKind, Variable *Dest, Operand *Source)
  addSource(Source);
 }

+// Constructs an ExtractElement instruction with two source operands:
+// Source1 is recorded as getSrc(0) and Source2 as getSrc(1).
+InstExtractElement::InstExtractElement(Cfg *Func, Variable *Dest,
+                                       Operand *Source1, Operand *Source2)
+    : Inst(Func, Inst::ExtractElement, 2, Dest) {
+  Operand *const Sources[] = {Source1, Source2};
+  for (size_t I = 0; I < sizeof(Sources) / sizeof(Sources[0]); ++I)
+    addSource(Sources[I]);
+}
+
 InstFcmp::InstFcmp(Cfg *Func, FCond Condition, Variable *Dest, Operand *Source1,
                   Operand *Source2)
    : Inst(Func, Inst::Fcmp, 2, Dest), Condition(Condition) {
@@ -281,6 +288,15 @@ InstIcmp::InstIcmp(Cfg *Func, ICond Condition, Variable *Dest, Operand *Source1,
  addSource(Source2);
 }

+// Constructs an InsertElement instruction with three source operands:
+// Source1, Source2 and Source3 are recorded as getSrc(0), getSrc(1) and
+// getSrc(2) respectively.
+InstInsertElement::InstInsertElement(Cfg *Func, Variable *Dest,
+                                     Operand *Source1, Operand *Source2,
+                                     Operand *Source3)
+    : Inst(Func, Inst::InsertElement, 3, Dest) {
+  Operand *const Sources[] = {Source1, Source2, Source3};
+  for (size_t I = 0; I < sizeof(Sources) / sizeof(Sources[0]); ++I)
+    addSource(Sources[I]);
+}
+
 InstLoad::InstLoad(Cfg *Func, Variable *Dest, Operand *SourceAddr)
    : Inst(Func, Inst::Load, 1, Dest) {
  addSource(SourceAddr);
@@ -586,6 +602,31 @@ void InstIcmp::dump(const Cfg *Func) const {
  dumpSources(Func);
 }

+// Writes "<dest> = extractelement <type> <src0>, <type> <src1>" to the dump
+// stream of Func's context.
+void InstExtractElement::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = extractelement ";
+  Str << getSrc(0)->getType() << " ";
+  getSrc(0)->dump(Func);
+  Str << ", ";
+  Str << getSrc(1)->getType() << " ";
+  getSrc(1)->dump(Func);
+}
+
+// Writes "<dest> = insertelement <type> <src0>, <type> <src1>, <type> <src2>"
+// to the dump stream of Func's context.
+void InstInsertElement::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = insertelement ";
+  Str << getSrc(0)->getType() << " ";
+  getSrc(0)->dump(Func);
+  Str << ", ";
+  Str << getSrc(1)->getType() << " ";
+  getSrc(1)->dump(Func);
+  Str << ", ";
+  Str << getSrc(2)->getType() << " ";
+  getSrc(2)->dump(Func);
+}
+
 void InstFcmp::dump(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrDump();
  dumpDest(Func);

--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -41,9 +41,11 @@ public:
    Br,
    Call,
    Cast,
+    ExtractElement,
    Fcmp,
    Icmp,
    IntrinsicCall,
+    InsertElement,
    Load,
    Phi,
    Ret,
@@ -344,6 +346,29 @@ private:
  const OpKind CastKind;
 };

+// ExtractElement instruction.  Instances are created only through create().
+// The constructor records Source1 and Source2 as getSrc(0) and getSrc(1);
+// IceConverter passes LLVM operands 0 and 1 in that order.
+class InstExtractElement : public Inst {
+public:
+  // Allocates storage from Func and constructs the instruction in place.
+  static InstExtractElement *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                    Operand *Source2) {
+    void *Storage = Func->allocateInst<InstExtractElement>();
+    return new (Storage) InstExtractElement(Func, Dest, Source1, Source2);
+  }
+
+  // Supports llvm::isa<>, llvm::cast<> and llvm::dyn_cast<>.
+  static bool classof(const Inst *Inst) {
+    return Inst->getKind() == ExtractElement;
+  }
+  virtual void dump(const Cfg *Func) const;
+
+private:
+  // Not copyable.
+  InstExtractElement(const InstExtractElement &) LLVM_DELETED_FUNCTION;
+  InstExtractElement &
+  operator=(const InstExtractElement &) LLVM_DELETED_FUNCTION;
+
+  InstExtractElement(Cfg *Func, Variable *Dest, Operand *Source1,
+                     Operand *Source2);
+  virtual ~InstExtractElement() {}
+};
+
 // Floating-point comparison instruction.  The source operands are
 // captured in getSrc(0) and getSrc(1).
 class InstFcmp : public Inst {
@@ -402,6 +427,28 @@ private:
  const ICond Condition;
 };

+// InsertElement instruction.  Instances are created only through create().
+// The constructor records Source1, Source2 and Source3 as getSrc(0),
+// getSrc(1) and getSrc(2); IceConverter passes LLVM operands 0, 1 and 2 in
+// that order.
+class InstInsertElement : public Inst {
+public:
+  // Allocates storage from Func and constructs the instruction in place.
+  static InstInsertElement *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                   Operand *Source2, Operand *Source3) {
+    void *Storage = Func->allocateInst<InstInsertElement>();
+    return new (Storage)
+        InstInsertElement(Func, Dest, Source1, Source2, Source3);
+  }
+
+  // Supports llvm::isa<>, llvm::cast<> and llvm::dyn_cast<>.
+  static bool classof(const Inst *Inst) {
+    return Inst->getKind() == InsertElement;
+  }
+  virtual void dump(const Cfg *Func) const;
+
+private:
+  // Not copyable.
+  InstInsertElement(const InstInsertElement &) LLVM_DELETED_FUNCTION;
+  InstInsertElement &operator=(const InstInsertElement &) LLVM_DELETED_FUNCTION;
+
+  InstInsertElement(Cfg *Func, Variable *Dest, Operand *Source1,
+                    Operand *Source2, Operand *Source3);
+  virtual ~InstInsertElement() {}
+};
+
 // Call to an intrinsic function.  The call target is captured as getSrc(0),
 // and arg I is captured as getSrc(I+1).
 class InstIntrinsicCall : public InstCall {

--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -42,7 +42,7 @@ const struct TypeX8632Attributes_ {
  const char *PackString;  // b, w, d, or <blank>
  const char *WidthString; // {byte,word,dword,qword} ptr
 } TypeX8632Attributes[] = {
-#define X(tag, cvt, sdss, pack, width)                                         \
+#define X(tag, elementty, cvt, sdss, pack, width)                              \
  { cvt, "" sdss, pack, width }                                                \
  ,
    ICETYPEX8632_TABLE
@@ -312,21 +312,6 @@ bool InstX8632Movq::isRedundantAssign() const {
  return false;
 }

-InstX8632Pshufd::InstX8632Pshufd(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2)
-    : InstX8632(Func, InstX8632::Pshufd, 2, Dest) {
-  addSource(Source1);
-  addSource(Source2);
-}
-
-InstX8632Shufps::InstX8632Shufps(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2)
-    : InstX8632(Func, InstX8632::Shufps, 3, Dest) {
-  addSource(Dest);
-  addSource(Source1);
-  addSource(Source2);
-}
-
 InstX8632Ret::InstX8632Ret(Cfg *Func, Variable *Source)
    : InstX8632(Func, InstX8632::Ret, Source ? 1 : 0, NULL) {
  if (Source)
@@ -454,9 +439,15 @@ void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
  Str << "\n";
 }

+
+// Unary ops
 template <> const char *InstX8632Bsf::Opcode = "bsf";
 template <> const char *InstX8632Bsr::Opcode = "bsr";
+template <> const char *InstX8632Lea::Opcode = "lea";
+template <> const char *InstX8632Movd::Opcode = "movd";
+template <> const char *InstX8632Movss::Opcode = "movss";
 template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
+// Binary ops
 template <> const char *InstX8632Add::Opcode = "add";
 template <> const char *InstX8632Addps::Opcode = "addps";
 template <> const char *InstX8632Adc::Opcode = "adc";
@@ -489,6 +480,12 @@ template <> const char *InstX8632Sar::Opcode = "sar";
 template <> const char *InstX8632Psra::Opcode = "psra";
 template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
 template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
+// Ternary ops
+template <> const char *InstX8632Shufps::Opcode = "shufps";
+template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";
+// Three address ops
+template <> const char *InstX8632Pextrw::Opcode = "pextrw";
+template <> const char *InstX8632Pshufd::Opcode = "pshufd";

 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
@@ -556,6 +553,22 @@ template <> void InstX8632Divss::emit(const Cfg *Func) const {
  emitTwoAddress(buf, this, Func);
 }

+// Emits "<opcode> <src1>".  Only source operand 1 is written; the
+// destination and the other two sources do not appear in the output
+// (presumably they are implicit register operands -- TODO confirm).
+template <> void InstX8632Div::emit(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  Operand *Divisor = getSrc(1);
+  Out << "\t" << Opcode << "\t";
+  Divisor->emit(Func);
+  Out << "\n";
+}
+
+// Emits "<opcode> <src1>".  Only source operand 1 is written; the
+// destination and the other two sources do not appear in the output
+// (presumably they are implicit register operands -- TODO confirm).
+template <> void InstX8632Idiv::emit(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  Operand *Divisor = getSrc(1);
+  Out << "\t" << Opcode << "\t";
+  Divisor->emit(Func);
+  Out << "\n";
+}
+
 template <> void InstX8632Imul::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 2);
@@ -868,6 +881,25 @@ void InstX8632StoreQ::dump(const Cfg *Func) const {
  getSrc(0)->dump(Func);
 }

+// Emits "lea <dest>, <src0>".  The destination must have a register.  A
+// source that is a Variable of vector type is emitted as an i32 instead.
+template <> void InstX8632Lea::emit(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  assert(getDest()->hasReg());
+  Out << "\tlea\t";
+  getDest()->emit(Func);
+  Out << ", ";
+  Operand *Source = getSrc(0);
+  Variable *SourceVar = llvm::dyn_cast<Variable>(Source);
+  if (SourceVar == NULL) {
+    Source->emit(Func);
+  } else {
+    // lea on x86-32 doesn't accept mem128 operands, so emit a vector-typed
+    // variable as an acceptable type.
+    const Type SourceTy = SourceVar->getType();
+    const Type EmitTy = isVectorType(SourceTy) ? IceType_i32 : SourceTy;
+    SourceVar->asType(EmitTy).emit(Func);
+  }
+  Out << "\n";
+}
+
 void InstX8632Mov::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 1);
@@ -893,6 +925,9 @@ void InstX8632Mov::emit(const Cfg *Func) const {
  // safe, we instead widen the dest to match src.  This works even
  // for stack-allocated dest variables because typeWidthOnStack()
  // pads to a 4-byte boundary even if only a lower portion is used.
+  // TODO: This assert disallows usages such as copying a floating point
+  // value between a vector and a scalar (which movss is used for).
+  // Clean this up.
  assert(Func->getTarget()->typeWidthInBytesOnStack(getDest()->getType()) ==
         Func->getTarget()->typeWidthInBytesOnStack(Src->getType()));
  getDest()->asType(Src->getType()).emit(Func);
@@ -1066,6 +1101,39 @@ template <> void InstX8632Pcmpgt::emit(const Cfg *Func) const {
  emitTwoAddress(buf, this, Func);
 }

+// Emits "<opcode> <dest>, <src0>, <src1>".  The destination must be an i16
+// variable that has a register; it is emitted as i32.
+template <> void InstX8632Pextrw::emit(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Out << "\t" << Opcode << "\t";
+  Variable *Target = getDest();
+  assert(Target->hasReg() && Target->getType() == IceType_i16);
+  // pextrw takes r32 dest.
+  Target->asType(IceType_i32).emit(Func);
+  for (int SrcIdx = 0; SrcIdx < 2; ++SrcIdx) {
+    Out << ", ";
+    getSrc(SrcIdx)->emit(Func);
+  }
+  Out << "\n";
+}
+
+// Emits "<opcode> <dest>, <src1>, <src2>".  Source operand 0 is not written.
+// When source 1 is a Variable it is emitted as i32 if it has a register and
+// as i16 otherwise.
+template <> void InstX8632Pinsrw::emit(const Cfg *Func) const {
+  Ostream &Out = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  Out << "\t" << Opcode << "\t";
+  getDest()->emit(Func);
+  Out << ", ";
+  Operand *Element = getSrc(1);
+  Variable *ElementVar = llvm::dyn_cast<Variable>(Element);
+  if (ElementVar == NULL) {
+    Element->emit(Func);
+  } else if (ElementVar->hasReg()) {
+    // If src1 is a register, it should be r32.
+    ElementVar->asType(IceType_i32).emit(Func);
+  } else {
+    ElementVar->asType(IceType_i16).emit(Func);
+  }
+  Out << ", ";
+  getSrc(2)->emit(Func);
+  Out << "\n";
+}
+
 void InstX8632Pop::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 0);
@@ -1138,25 +1206,6 @@ template <> void InstX8632Psra::emit(const Cfg *Func) const {
  emitTwoAddress(buf, this, Func);
 }

-void InstX8632Pshufd::emit(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 2);
-  Str << "\tpshufd\t";
-  getDest()->emit(Func);
-  Str << ", ";
-  getSrc(0)->emit(Func);
-  Str << ", ";
-  getSrc(1)->emit(Func);
-  Str << "\n";
-}
-
-void InstX8632Pshufd::dump(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrDump();
-  dumpDest(Func);
-  Str << " = pshufd." << getDest()->getType() << " ";
-  dumpSources(Func);
-}
-
 void InstX8632Ret::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  Str << "\tret\n";
@@ -1169,25 +1218,6 @@ void InstX8632Ret::dump(const Cfg *Func) const {
  dumpSources(Func);
 }

-void InstX8632Shufps::emit(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 3);
-  Str << "\tshufps\t";
-  getDest()->emit(Func);
-  Str << ", ";
-  getSrc(1)->emit(Func);
-  Str << ", ";
-  getSrc(2)->emit(Func);
-  Str << "\n";
-}
-
-void InstX8632Shufps::dump(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrDump();
-  dumpDest(Func);
-  Str << " = shufps." << getDest()->getType() << " ";
-  dumpSources(Func);
-}
-
 void InstX8632Xadd::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  if (Locked) {

--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -66,23 +66,23 @@
  X(Br_p,        "p",  "jp")   \
 //#define X(tag, dump, emit)

-#define ICETYPEX8632_TABLE                          \
-  /* tag,          cvt, sdss,  pack, width */       \
-  X(IceType_void,  "?",  ""  , "" ,  "???")         \
-  X(IceType_i1,    "si", ""  , "" ,  "byte ptr")    \
-  X(IceType_i8,    "si", ""  , "" ,  "byte ptr")    \
-  X(IceType_i16,   "si", ""  , "" ,  "word ptr")    \
-  X(IceType_i32,   "si", ""  , "" ,  "dword ptr")   \
-  X(IceType_i64,   "si", ""  , "" ,  "qword ptr")   \
-  X(IceType_f32,   "ss", "ss", "" ,  "dword ptr")   \
-  X(IceType_f64,   "sd", "sd", "" ,  "qword ptr")   \
-  X(IceType_v4i1,  "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v8i1,  "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v16i1, "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v16i8, "?",  ""  , "b",  "xmmword ptr") \
-  X(IceType_v8i16, "?",  ""  , "w",  "xmmword ptr") \
-  X(IceType_v4i32, "dq", ""  , "d",  "xmmword ptr") \
-  X(IceType_v4f32, "ps", ""  , "" ,  "xmmword ptr") \
-//#define X(tag, cvt, sdss, width)
+// X-macro table with one row per ICE type.  Columns: the type tag, the
+// element type (IceType_void for the scalar types and for void itself; the
+// i1 vector types list i32, i16 and i8 respectively), the "cvt" string, the
+// "sdss" string, the "pack" suffix (b, w, d, or blank), and the memory
+// operand width string.
+#define ICETYPEX8632_TABLE                                        \
+  /* tag,          element type, cvt, sdss,  pack, width */       \
+  X(IceType_void,  IceType_void, "?" , ""  , "" ,  "???")         \
+  X(IceType_i1,    IceType_void, "si", ""  , "" ,  "byte ptr")    \
+  X(IceType_i8,    IceType_void, "si", ""  , "" ,  "byte ptr")    \
+  X(IceType_i16,   IceType_void, "si", ""  , "" ,  "word ptr")    \
+  X(IceType_i32,   IceType_void, "si", ""  , "" ,  "dword ptr")   \
+  X(IceType_i64,   IceType_void, "si", ""  , "" ,  "qword ptr")   \
+  X(IceType_f32,   IceType_void, "ss", "ss", "" ,  "dword ptr")   \
+  X(IceType_f64,   IceType_void, "sd", "sd", "" ,  "qword ptr")   \
+  X(IceType_v4i1,  IceType_i32 , "?" , ""  , "" ,  "xmmword ptr") \
+  X(IceType_v8i1,  IceType_i16 , "?" , ""  , "" ,  "xmmword ptr") \
+  X(IceType_v16i1, IceType_i8  , "?" , ""  , "" ,  "xmmword ptr") \
+  X(IceType_v16i8, IceType_i8  , "?" , ""  , "b",  "xmmword ptr") \
+  X(IceType_v8i16, IceType_i16 , "?" , ""  , "w",  "xmmword ptr") \
+  X(IceType_v4i32, IceType_i32 , "dq", ""  , "d",  "xmmword ptr") \
+  X(IceType_v4f32, IceType_f32 , "ps", ""  , "" ,  "xmmword ptr") \
+//#define X(tag, elementty, cvt, sdss, pack, width)

 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -156,11 +156,14 @@ public:
    Idiv,
    Imul,
    Label,
+    Lea,
    Load,
    Mfence,
    Mov,
+    Movd,
    Movp,
    Movq,
+    Movss,
    Movsx,
    Movzx,
    Mul,
@@ -172,6 +175,8 @@ public:
    Pand,
    Pcmpeq,
    Pcmpgt,
+    Pextrw,
+    Pinsrw,
    Pmullw,
    Pmuludq,
    Pop,
@@ -430,7 +435,11 @@ public:
    Ostream &Str = Func->getContext()->getStrEmit();
    assert(getSrcSize() == 3);
    Str << "\t" << Opcode << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
    getSrc(1)->emit(Func);
+    Str << ", ";
+    getSrc(2)->emit(Func);
    Str << "\n";
  }
  virtual void dump(const Cfg *Func) const {
@@ -454,8 +463,54 @@ private:
  static const char *Opcode;
 };

+// Instructions of the form x := y op z, where the destination is distinct
+// from both sources.  K selects the instruction kind; the Opcode string is
+// defined separately for each instantiation.
+template <InstX8632::InstKindX8632 K>
+class InstX8632ThreeAddressop : public InstX8632 {
+public:
+  // Allocates storage from Func and constructs the instruction in place.
+  static InstX8632ThreeAddressop *create(Cfg *Func, Variable *Dest,
+                                         Operand *Source0, Operand *Source1) {
+    void *Storage = Func->allocate<InstX8632ThreeAddressop>();
+    return new (Storage)
+        InstX8632ThreeAddressop(Func, Dest, Source0, Source1);
+  }
+  // Supports llvm::isa<>, llvm::cast<> and llvm::dyn_cast<>.
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+  // Emits "<opcode> <dest>, <src0>, <src1>".
+  virtual void emit(const Cfg *Func) const {
+    Ostream &Out = Func->getContext()->getStrEmit();
+    assert(getSrcSize() == 2);
+    Out << "\t" << Opcode << "\t";
+    getDest()->emit(Func);
+    for (int SrcIdx = 0; SrcIdx < 2; ++SrcIdx) {
+      Out << ", ";
+      getSrc(SrcIdx)->emit(Func);
+    }
+    Out << "\n";
+  }
+  // Dumps "<dest> = <opcode>.<dest type> <sources>".
+  virtual void dump(const Cfg *Func) const {
+    Ostream &Out = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Out << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+
+private:
+  // Not copyable.
+  InstX8632ThreeAddressop(const InstX8632ThreeAddressop &)
+      LLVM_DELETED_FUNCTION;
+  InstX8632ThreeAddressop &
+  operator=(const InstX8632ThreeAddressop &) LLVM_DELETED_FUNCTION;
+
+  // Records Source0 and Source1 as getSrc(0) and getSrc(1).
+  InstX8632ThreeAddressop(Cfg *Func, Variable *Dest, Operand *Source0,
+                          Operand *Source1)
+      : InstX8632(Func, K, 2, Dest) {
+    addSource(Source0);
+    addSource(Source1);
+  }
+  virtual ~InstX8632ThreeAddressop() {}
+  static const char *Opcode;
+};
+
 typedef InstX8632Unaryop<InstX8632::Bsf> InstX8632Bsf;
 typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
+typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea;
+typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd;
+typedef InstX8632Unaryop<InstX8632::Movss> InstX8632Movss;
 typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
 typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
 typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
@@ -489,6 +544,10 @@ typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq;
 typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
+typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw;
+typedef InstX8632Ternop<InstX8632::Shufps> InstX8632Shufps;
+typedef InstX8632ThreeAddressop<InstX8632::Pextrw> InstX8632Pextrw;
+typedef InstX8632ThreeAddressop<InstX8632::Pshufd> InstX8632Pshufd;

 // Base class for a lockable x86-32 instruction (emits a locked prefix).
 class InstX8632Lockable : public InstX8632 {
@@ -994,27 +1053,6 @@ private:
  virtual ~InstX8632Push() {}
 };

-// Pshufd - shuffle a vector of doublewords 
-class InstX8632Pshufd : public InstX8632 {
-public:
-  static InstX8632Pshufd *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2) {
-    return new (Func->allocate<InstX8632Pshufd>())
-        InstX8632Pshufd(Func, Dest, Source1, Source2);
-  }
-  virtual void emit(const Cfg *Func) const;
-  virtual void dump(const Cfg *Func) const;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Pshufd); }
-
-private:
-  InstX8632Pshufd(Cfg *Func, Variable *Dest, Operand *Source1,
-                  Operand *Source2);
-  InstX8632Pshufd(const InstX8632Pshufd &) LLVM_DELETED_FUNCTION;
-  InstX8632Pshufd &operator=(const InstX8632Pshufd &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Pshufd() {}
-  static const char *Opcode;
-};
-
 // Ret instruction.  Currently only supports the "ret" version that
 // does not pop arguments.  This instruction takes a Source operand
 // (for non-void returning functions) for liveness analysis, though
@@ -1035,27 +1073,6 @@ private:
  virtual ~InstX8632Ret() {}
 };

-// Shufps - select from two vectors of floating point values
-class InstX8632Shufps : public InstX8632 {
-public:
-  static InstX8632Shufps *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2) {
-    return new (Func->allocate<InstX8632Shufps>())
-        InstX8632Shufps(Func, Dest, Source1, Source2);
-  }
-  virtual void emit(const Cfg *Func) const;
-  virtual void dump(const Cfg *Func) const;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Shufps); }
-
-private:
-  InstX8632Shufps(Cfg *Func, Variable *Dest, Operand *Source1,
-                  Operand *Source2);
-  InstX8632Shufps(const InstX8632Shufps &) LLVM_DELETED_FUNCTION;
-  InstX8632Shufps &operator=(const InstX8632Shufps &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Shufps() {}
-  static const char *Opcode;
-};
-
 // Exchanging Add instruction.  Exchanges the first operand (destination
 // operand) with the second operand (source operand), then loads the sum
 // of the two values into the destination operand. The destination may be

--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -110,12 +110,18 @@ void TargetLowering::lower() {
  case Inst::Cast:
    lowerCast(llvm::dyn_cast<InstCast>(Inst));
    break;
+  case Inst::ExtractElement:
+    lowerExtractElement(llvm::dyn_cast<InstExtractElement>(Inst));
+    break;
  case Inst::Fcmp:
    lowerFcmp(llvm::dyn_cast<InstFcmp>(Inst));
    break;
  case Inst::Icmp:
    lowerIcmp(llvm::dyn_cast<InstIcmp>(Inst));
    break;
+  case Inst::InsertElement:
+    lowerInsertElement(llvm::dyn_cast<InstInsertElement>(Inst));
+    break;
  case Inst::IntrinsicCall:
    lowerIntrinsicCall(llvm::dyn_cast<InstIntrinsicCall>(Inst));
    break;

--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -169,7 +169,9 @@ protected:
  virtual void lowerCall(const InstCall *Inst) = 0;
  virtual void lowerCast(const InstCast *Inst) = 0;
+  virtual void lowerExtractElement(const InstExtractElement *Inst) = 0;
  virtual void lowerFcmp(const InstFcmp *Inst) = 0;
  virtual void lowerIcmp(const InstIcmp *Inst) = 0;
+  virtual void lowerInsertElement(const InstInsertElement *Inst) = 0;
  virtual void lowerIntrinsicCall(const InstIntrinsicCall *Inst) = 0;
  virtual void lowerLoad(const InstLoad *Inst) = 0;
  virtual void lowerPhi(const InstPhi *Inst) = 0;

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -82,9 +82,11 @@ protected:
  virtual void lowerBr(const InstBr *Inst);
  virtual void lowerCall(const InstCall *Inst);
  virtual void lowerCast(const InstCast *Inst);
+  virtual void lowerExtractElement(const InstExtractElement *Inst);
  virtual void lowerFcmp(const InstFcmp *Inst);
  virtual void lowerIcmp(const InstIcmp *Inst);
+  virtual void lowerInsertElement(const InstInsertElement *Inst);
  virtual void lowerIntrinsicCall(const InstIntrinsicCall *Inst);
  virtual void lowerLoad(const InstLoad *Inst);
  virtual void lowerPhi(const InstPhi *Inst);
  virtual void lowerRet(const InstRet *Inst);
@@ -152,6 +154,10 @@ protected:
  Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
  Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);

+  // Return a memory operand corresponding to a stack allocated Variable.
+  OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
+                                                uint32_t Offset = 0);
+
  // The following are helpers that insert lowered x86 instructions
  // with minimal syntactic overhead, so that the lowering code can
  // look as close to assembly as practical.
@@ -237,6 +243,9 @@ protected:
  void _imul(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Imul::create(Func, Dest, Src0));
  }
+  void _lea(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Lea::create(Func, Dest, Src0));
+  }
  void _mfence() { Context.insert(InstX8632Mfence::create(Func)); }
  // If Dest=NULL is passed in, then a new variable is created, marked
  // as infinite register allocation weight, and returned through the
@@ -249,12 +258,18 @@ protected:
      Context.insert(InstX8632Mov::create(Func, Dest, Src0));
    }
  }
+  void _movd(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movd::create(Func, Dest, Src0));
+  }
  void _movp(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Movp::create(Func, Dest, Src0));
  }
  void _movq(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Movq::create(Func, Dest, Src0));
  }
+  void _movss(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movss::create(Func, Dest, Src0));
+  }
  void _movsx(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Movsx::create(Func, Dest, Src0));
  }
@@ -288,6 +303,12 @@ protected:
  void _pcmpgt(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0));
  }
+  void _pextrw(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pextrw::create(Func, Dest, Src0, Src1));
+  }
+  void _pinsrw(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pinsrw::create(Func, Dest, Src0, Src1));
+  }
  void _pmullw(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Pmullw::create(Func, Dest, Src0));
  }

--- a/tests_lit/llvm2ice_tests/vector-ops.ll
+++ b/tests_lit/llvm2ice_tests/vector-ops.ll
+; This checks support for insertelement and extractelement.
+
+; RUN: %llvm2ice --verbose inst %s | FileCheck %s
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+; insertelement operations
+
+define <4 x float> @insertelement_v4f32(<4 x float> %vec, float %elt) {
+entry:
+  %res = insertelement <4 x float> %vec, float %elt, i32 1
+  ret <4 x float> %res
+; CHECK-LABEL: insertelement_v4f32:
+; CHECK: shufps
+; CHECK: shufps
+}
+
+define <4 x i32> @insertelement_v4i32(<4 x i32> %vec, i32 %elt) {
+entry:
+  %res = insertelement <4 x i32> %vec, i32 %elt, i32 1
+  ret <4 x i32> %res
+; CHECK-LABEL: insertelement_v4i32:
+; CHECK: shufps
+; CHECK: shufps
+}
+
+define <8 x i16> @insertelement_v8i16(<8 x i16> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i16
+  %res = insertelement <8 x i16> %vec, i16 %elt, i32 1
+  ret <8 x i16> %res
+; CHECK-LABEL: insertelement_v8i16:
+; CHECK: pinsrw
+}
+
+define <16 x i8> @insertelement_v16i8(<16 x i8> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i8
+  %res = insertelement <16 x i8> %vec, i8 %elt, i32 1
+  ret <16 x i8> %res
+; CHECK-LABEL: insertelement_v16i8:
+; CHECK: movups
+; CHECK: lea
+; CHECK: mov
+}
+
+define <4 x i1> @insertelement_v4i1(<4 x i1> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i1
+  %res = insertelement <4 x i1> %vec, i1 %elt, i32 1
+  ret <4 x i1> %res
+; CHECK-LABEL: insertelement_v4i1:
+; CHECK: shufps
+; CHECK: shufps
+}
+
+define <8 x i1> @insertelement_v8i1(<8 x i1> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i1
+  %res = insertelement <8 x i1> %vec, i1 %elt, i32 1
+  ret <8 x i1> %res
+; CHECK-LABEL: insertelement_v8i1:
+; CHECK: pinsrw
+}
+
+define <16 x i1> @insertelement_v16i1(<16 x i1> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i1
+  %res = insertelement <16 x i1> %vec, i1 %elt, i32 1
+  ret <16 x i1> %res
+; CHECK-LABEL: insertelement_v16i1:
+; CHECK: movups
+; CHECK: lea
+; CHECK: mov
+}
+
+; extractelement operations
+
+define float @extractelement_v4f32(<4 x float> %vec) {
+entry:
+  %res = extractelement <4 x float> %vec, i32 1
+  ret float %res
+; CHECK-LABEL: extractelement_v4f32:
+; CHECK: pshufd
+}
+
+define i32 @extractelement_v4i32(<4 x i32> %vec) {
+entry:
+  %res = extractelement <4 x i32> %vec, i32 1
+  ret i32 %res
+; CHECK-LABEL: extractelement_v4i32:
+; CHECK: pshufd
+}
+
+define i32 @extractelement_v8i16(<8 x i16> %vec) {
+entry:
+  %res = extractelement <8 x i16> %vec, i32 1
+  %res.ext = zext i16 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v8i16:
+; CHECK: pextrw
+}
+
+define i32 @extractelement_v16i8(<16 x i8> %vec) {
+entry:
+  %res = extractelement <16 x i8> %vec, i32 1
+  %res.ext = zext i8 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v16i8:
+; CHECK: movups
+; CHECK: lea
+; CHECK: mov
+}
+
+define i32 @extractelement_v4i1(<4 x i1> %vec) {
+entry:
+  %res = extractelement <4 x i1> %vec, i32 1
+  %res.ext = zext i1 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v4i1:
+; CHECK: pshufd
+}
+
+define i32 @extractelement_v8i1(<8 x i1> %vec) {
+entry:
+  %res = extractelement <8 x i1> %vec, i32 1
+  %res.ext = zext i1 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v8i1:
+; CHECK: pextrw
+}
+
+define i32 @extractelement_v16i1(<16 x i1> %vec) {
+entry:
+  %res = extractelement <16 x i1> %vec, i32 1
+  %res.ext = zext i1 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v16i1:
+; CHECK: movups
+; CHECK: lea
+; CHECK: mov
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ