Lower casting operations that involve vector types.

Impacted instructions:

bitcast {v4f32, v4i32, v8i16, v16i8} <-> {v4f32, v4i32, v8i16, v16i8}
bitcast v8i1 <-> i8
bitcast v16i1 <-> i16

(There was already code present to handle trivial bitcasts like v16i1 <-> v16i1.)

[sz]ext v4i1 -> v4i32
[sz]ext v8i1 -> v8i16
[sz]ext v16i1 -> v16i8

trunc v4i32 -> v4i1
trunc v8i16 -> v8i1
trunc v16i8 -> v16i1

[su]itofp v4i32 -> v4f32
fpto[su]i v4f32 -> v4i32

Where there is a relatively simple lowering to x86 instructions, it has been used. Otherwise a helper call is used.

Some lowerings require a materialization of an integer vector with 1s in each entry. Since there is no support for vector constant pools, the constant is materialized purely through register operations.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/383303003

Lower casting operations that involve vector types.
83b8036b · Matt Wala · e4da26f6 · 83b8036b · 83b8036b · 83b8036b
Commit 83b8036b authored Jul 16, 2014 by Matt Wala
7 changed files
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -39,10 +39,11 @@ const size_t InstX8632BrAttributesSize =
 const struct TypeX8632Attributes_ {
  const char *CvtString;   // i (integer), s (single FP), d (double FP)
  const char *SdSsString;  // ss, sd, or <blank>
+  const char *PackString;  // b, w, d, or <blank>
  const char *WidthString; // {byte,word,dword,qword} ptr
 } TypeX8632Attributes[] = {
-#define X(tag, cvt, sdss, width)                                               \
+#define X(tag, cvt, sdss, pack, width)                                         \
-  { cvt, "" sdss, width }                                                      \
+  { cvt, "" sdss, pack, width }                                                \
  ,
    ICETYPEX8632_TABLE
 #undef X
@@ -448,8 +449,10 @@ template <> const char *InstX8632Addss::Opcode = "addss";
 template <> const char *InstX8632Sub::Opcode = "sub";
 template <> const char *InstX8632Subps::Opcode = "subps";
 template <> const char *InstX8632Subss::Opcode = "subss";
+template <> const char *InstX8632Psub::Opcode = "psub";
 template <> const char *InstX8632Sbb::Opcode = "sbb";
 template <> const char *InstX8632And::Opcode = "and";
+template <> const char *InstX8632Pand::Opcode = "pand";
 template <> const char *InstX8632Or::Opcode = "or";
 template <> const char *InstX8632Xor::Opcode = "xor";
 template <> const char *InstX8632Pxor::Opcode = "pxor";
@@ -461,8 +464,12 @@ template <> const char *InstX8632Divps::Opcode = "divps";
 template <> const char *InstX8632Idiv::Opcode = "idiv";
 template <> const char *InstX8632Divss::Opcode = "divss";
 template <> const char *InstX8632Shl::Opcode = "shl";
+template <> const char *InstX8632Psll::Opcode = "psll";
 template <> const char *InstX8632Shr::Opcode = "shr";
 template <> const char *InstX8632Sar::Opcode = "sar";
+template <> const char *InstX8632Psra::Opcode = "psra";
+template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
+template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
@@ -690,7 +697,7 @@ void InstX8632Cmpxchg8b::dump(const Cfg *Func) const {
 void InstX8632Cvt::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 1);
-  Str << "\tcvts" << TypeX8632Attributes[getSrc(0)->getType()].CvtString << "2s"
+  Str << "\tcvt" << TypeX8632Attributes[getSrc(0)->getType()].CvtString << "2"
      << TypeX8632Attributes[getDest()->getType()].CvtString << "\t";
  getDest()->emit(Func);
  Str << ", ";
@@ -701,8 +708,8 @@ void InstX8632Cvt::emit(const Cfg *Func) const {
 void InstX8632Cvt::dump(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrDump();
  dumpDest(Func);
-  Str << " = cvts" << TypeX8632Attributes[getSrc(0)->getType()].CvtString
+  Str << " = cvt" << TypeX8632Attributes[getSrc(0)->getType()].CvtString
-      << "2s" << TypeX8632Attributes[getDest()->getType()].CvtString << " ";
+      << "2" << TypeX8632Attributes[getDest()->getType()].CvtString << " ";
  dumpSources(Func);
 }
@@ -1000,6 +1007,20 @@ void InstX8632Fstp::dump(const Cfg *Func) const {
  Str << "\n";
 }
+template <> void InstX8632Pcmpeq::emit(const Cfg *Func) const {
+  char buf[30];
+  snprintf(buf, llvm::array_lengthof(buf), "pcmpeq%s",
+           TypeX8632Attributes[getDest()->getType()].PackString);
+  emitTwoAddress(buf, this, Func);
+}
+template <> void InstX8632Pcmpgt::emit(const Cfg *Func) const {
+  char buf[30];
+  snprintf(buf, llvm::array_lengthof(buf), "pcmpgt%s",
+           TypeX8632Attributes[getDest()->getType()].PackString);
+  emitTwoAddress(buf, this, Func);
+}
 void InstX8632Pop::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 0);
@@ -1054,6 +1075,31 @@ void InstX8632Push::dump(const Cfg *Func) const {
  dumpSources(Func);
 }
+template <> void InstX8632Psll::emit(const Cfg *Func) const {
+  assert(getDest()->getType() == IceType_v8i16 ||
+         getDest()->getType() == IceType_v4i32);
+  char buf[30];
+  snprintf(buf, llvm::array_lengthof(buf), "psll%s",
+           TypeX8632Attributes[getDest()->getType()].PackString);
+  emitTwoAddress(buf, this, Func);
+}
+template <> void InstX8632Psra::emit(const Cfg *Func) const {
+  assert(getDest()->getType() == IceType_v8i16 ||
+         getDest()->getType() == IceType_v4i32);
+  char buf[30];
+  snprintf(buf, llvm::array_lengthof(buf), "psra%s",
+           TypeX8632Attributes[getDest()->getType()].PackString);
+  emitTwoAddress(buf, this, Func);
+}
+template <> void InstX8632Psub::emit(const Cfg *Func) const {
+  char buf[30];
+  snprintf(buf, llvm::array_lengthof(buf), "psub%s",
+           TypeX8632Attributes[getDest()->getType()].PackString);
+  emitTwoAddress(buf, this, Func);
+}
 void InstX8632Ret::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  Str << "\tret\n";

--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -66,23 +66,23 @@
  X(Br_p,        "p",  "jp")   \
 //#define X(tag, dump, emit)
-#define ICETYPEX8632_TABLE                   \
+#define ICETYPEX8632_TABLE                          \
-  /* tag,         cvt, sdss, width */        \
+  /* tag,          cvt, sdss,  pack, width */       \
-  X(IceType_void,  "?", ""  , "???")         \
+  X(IceType_void,  "?",  ""  , "" ,  "???")         \
-  X(IceType_i1,    "i", ""  , "byte ptr")    \
+  X(IceType_i1,    "si", ""  , "" ,  "byte ptr")    \
-  X(IceType_i8,    "i", ""  , "byte ptr")    \
+  X(IceType_i8,    "si", ""  , "" ,  "byte ptr")    \
-  X(IceType_i16,   "i", ""  , "word ptr")    \
+  X(IceType_i16,   "si", ""  , "" ,  "word ptr")    \
-  X(IceType_i32,   "i", ""  , "dword ptr")   \
+  X(IceType_i32,   "si", ""  , "" ,  "dword ptr")   \
-  X(IceType_i64,   "i", ""  , "qword ptr")   \
+  X(IceType_i64,   "si", ""  , "" ,  "qword ptr")   \
-  X(IceType_f32,   "s", "ss", "dword ptr")   \
+  X(IceType_f32,   "ss", "ss", "" ,  "dword ptr")   \
-  X(IceType_f64,   "d", "sd", "qword ptr")   \
+  X(IceType_f64,   "sd", "sd", "" ,  "qword ptr")   \
-  X(IceType_v4i1,  "?", ""  , "xmmword ptr") \
+  X(IceType_v4i1,  "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v8i1,  "?", ""  , "xmmword ptr") \
+  X(IceType_v8i1,  "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v16i1, "?", ""  , "xmmword ptr") \
+  X(IceType_v16i1, "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v16i8, "?", ""  , "xmmword ptr") \
+  X(IceType_v16i8, "?",  ""  , "b",  "xmmword ptr") \
-  X(IceType_v8i16, "?", ""  , "xmmword ptr") \
+  X(IceType_v8i16, "?",  ""  , "w",  "xmmword ptr") \
-  X(IceType_v4i32, "?", ""  , "xmmword ptr") \
+  X(IceType_v4i32, "dq", ""  , "d",  "xmmword ptr") \
-  X(IceType_v4f32, "?", ""  , "xmmword ptr") \
+  X(IceType_v4f32, "ps", ""  , "",   "xmmword ptr") \
 //#define X(tag, cvt, sdss, width)
 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -168,8 +168,14 @@ public:
    Mulss,
    Neg,
    Or,
+    Pand,
+    Pcmpeq,
+    Pcmpgt,
    Pop,
    Push,
+    Psll,
+    Psra,
+    Psub,
    Pxor,
    Ret,
    Sar,
@@ -453,7 +459,9 @@ typedef InstX8632Binop<InstX8632::Sub> InstX8632Sub;
 typedef InstX8632Binop<InstX8632::Subps> InstX8632Subps;
 typedef InstX8632Binop<InstX8632::Subss> InstX8632Subss;
 typedef InstX8632Binop<InstX8632::Sbb> InstX8632Sbb;
+typedef InstX8632Binop<InstX8632::Psub> InstX8632Psub;
 typedef InstX8632Binop<InstX8632::And> InstX8632And;
+typedef InstX8632Binop<InstX8632::Pand> InstX8632Pand;
 typedef InstX8632Binop<InstX8632::Or> InstX8632Or;
 typedef InstX8632Binop<InstX8632::Xor> InstX8632Xor;
 typedef InstX8632Binop<InstX8632::Pxor> InstX8632Pxor;
@@ -463,8 +471,12 @@ typedef InstX8632Binop<InstX8632::Mulss> InstX8632Mulss;
 typedef InstX8632Binop<InstX8632::Divps> InstX8632Divps;
 typedef InstX8632Binop<InstX8632::Divss> InstX8632Divss;
 typedef InstX8632Binop<InstX8632::Shl, true> InstX8632Shl;
+typedef InstX8632Binop<InstX8632::Psll> InstX8632Psll;
 typedef InstX8632Binop<InstX8632::Shr, true> InstX8632Shr;
 typedef InstX8632Binop<InstX8632::Sar, true> InstX8632Sar;
+typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra;
+typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq;
+typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -87,6 +87,8 @@ InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
 // The maximum number of arguments to pass in XMM registers
 const unsigned X86_MAX_XMM_ARGS = 4;
+// The number of bits in a byte
+const unsigned X86_CHAR_BIT = 8;
 // In some cases, there are x-macros tables for both high-level and
 // low-level instructions/operands that use the same enum key value.
@@ -157,7 +159,7 @@ void xMacroIntegrityCheck() {
    // Define a temporary set of enum values based on low-level
    // table entries.
    enum _tmp_enum {
-#define X(tag, cvt, sdss, width) _tmp_##tag,
+#define X(tag, cvt, sdss, pack, width) _tmp_##tag,
      ICETYPEX8632_TABLE
 #undef X
          _num
@@ -169,7 +171,7 @@ void xMacroIntegrityCheck() {
 #undef X
 // Define a set of constants based on low-level table entries,
 // and ensure the table entry keys are consistent.
-#define X(tag, cvt, sdss, width)                                               \
+#define X(tag, cvt, sdss, pack, width)                                         \
  static const int _table2_##tag = _tmp_##tag;                                 \
  STATIC_ASSERT(_table1_##tag == _table2_##tag);
    ICETYPEX8632_TABLE;
@@ -1573,6 +1575,28 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
      _mov(T_Hi, T_Lo);
      _sar(T_Hi, Shift);
      _mov(DestHi, T_Hi);
+    } else if (isVectorType(Dest->getType())) {
+      Type DestTy = Dest->getType();
+      if (DestTy == IceType_v16i8) {
+        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
+        Variable *OneMask = makeVectorOfOnes(Dest->getType());
+        Variable *T = makeReg(DestTy);
+        _movp(T, Src0RM);
+        _pand(T, OneMask);
+        Variable *Zeros = makeVectorOfZeros(Dest->getType());
+        _pcmpgt(T, Zeros);
+        _movp(Dest, T);
+      } else {
+        // width = width(elty) - 1; dest = (src << width) >> width
+        SizeT ShiftAmount =
+            X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 1;
+        Constant *ShiftConstant = Ctx->getConstantInt(IceType_i8, ShiftAmount);
+        Variable *T = makeReg(DestTy);
+        _movp(T, Src0RM);
+        _psll(T, ShiftConstant);
+        _psra(T, ShiftConstant);
+        _movp(Dest, T);
+      }
    } else {
      // TODO: Sign-extend an i1 via "shl reg, 31; sar reg, 31", and
      // also copy to the high operand of a 64-bit variable.
@@ -1604,6 +1628,14 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
      _movzx(T, Src0RM);
      _and(T, One);
      _mov(Dest, T);
+    } else if (isVectorType(Dest->getType())) {
+      // onemask = materialize(1,1,...); dest = onemask & src
+      Type DestTy = Dest->getType();
+      Variable *OneMask = makeVectorOfOnes(DestTy);
+      Variable *T = makeReg(DestTy);
+      _movp(T, Src0RM);
+      _pand(T, OneMask);
+      _movp(Dest, T);
    } else {
      // t1 = movzx src; dst = t1
      Variable *T = makeReg(Dest->getType());
@@ -1613,14 +1645,25 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
    break;
  }
  case InstCast::Trunc: {
-    Operand *Src0 = Inst->getSrc(0);
+    if (isVectorType(Dest->getType())) {
-    if (Src0->getType() == IceType_i64)
+      // onemask = materialize(1,1,...); dst = src & onemask
-      Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      Type Src0Ty = Src0RM->getType();
-    // t1 = trunc Src0RM; Dest = t1
+      Variable *OneMask = makeVectorOfOnes(Src0Ty);
-    Variable *T = NULL;
+      Variable *T = makeReg(Dest->getType());
-    _mov(T, Src0RM);
+      _movp(T, Src0RM);
-    _mov(Dest, T);
+      _pand(T, OneMask);
+      _movp(Dest, T);
+    } else {
+      Operand *Src0 = Inst->getSrc(0);
+      if (Src0->getType() == IceType_i64)
+        Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // t1 = trunc Src0RM; Dest = t1
+      Variable *T = NULL;
+      _mov(T, Src0RM);
+      _mov(Dest, T);
+    }
    break;
  }
  case InstCast::Fptrunc:
@@ -1633,7 +1676,14 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
    break;
  }
  case InstCast::Fptosi:
-    if (Dest->getType() == IceType_i64) {
+    if (isVectorType(Dest->getType())) {
+      assert(Dest->getType() == IceType_v4i32 &&
+             Inst->getSrc(0)->getType() == IceType_v4f32);
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      Variable *T = makeReg(Dest->getType());
+      _cvt(T, Src0RM);
+      _movp(Dest, T);
+    } else if (Dest->getType() == IceType_i64) {
      // Use a helper for converting floating-point values to 64-bit
      // integers.  SSE2 appears to have no way to convert from xmm
      // registers to something like the edx:eax register pair, and
@@ -1660,7 +1710,15 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
    }
    break;
  case InstCast::Fptoui:
-    if (Dest->getType() == IceType_i64 || Dest->getType() == IceType_i32) {
+    if (isVectorType(Dest->getType())) {
+      assert(Dest->getType() == IceType_v4i32 &&
+             Inst->getSrc(0)->getType() == IceType_v4f32);
+      const SizeT MaxSrcs = 1;
+      InstCall *Call = makeHelperCall("Sz_fptoui_v4f32", Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      lowerCall(Call);
+    } else if (Dest->getType() == IceType_i64 ||
+               Dest->getType() == IceType_i32) {
      // Use a helper for both x86-32 and x86-64.
      split64(Dest);
      const SizeT MaxSrcs = 1;
@@ -1687,7 +1745,14 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
    }
    break;
  case InstCast::Sitofp:
-    if (Inst->getSrc(0)->getType() == IceType_i64) {
+    if (isVectorType(Dest->getType())) {
+      assert(Dest->getType() == IceType_v4f32 &&
+             Inst->getSrc(0)->getType() == IceType_v4i32);
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      Variable *T = makeReg(Dest->getType());
+      _cvt(T, Src0RM);
+      _movp(Dest, T);
+    } else if (Inst->getSrc(0)->getType() == IceType_i64) {
      // Use a helper for x86-32.
      const SizeT MaxSrcs = 1;
      Type DestType = Dest->getType();
@@ -1713,7 +1778,15 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
    break;
  case InstCast::Uitofp: {
    Operand *Src0 = Inst->getSrc(0);
-    if (Src0->getType() == IceType_i64 || Src0->getType() == IceType_i32) {
+    if (isVectorType(Src0->getType())) {
+      assert(Dest->getType() == IceType_v4f32 &&
+             Src0->getType() == IceType_v4i32);
+      const SizeT MaxSrcs = 1;
+      InstCall *Call = makeHelperCall("Sz_uitofp_v4i32", Dest, MaxSrcs);
+      Call->addArg(Src0);
+      lowerCall(Call);
+    } else if (Src0->getType() == IceType_i64 ||
+               Src0->getType() == IceType_i32) {
      // Use a helper for x86-32 and x86-64.  Also use a helper for
      // i32 on x86-32.
      const SizeT MaxSrcs = 1;
@@ -1752,6 +1825,18 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
    switch (Dest->getType()) {
    default:
      llvm_unreachable("Unexpected Bitcast dest type");
+    case IceType_i8: {
+      assert(Src0->getType() == IceType_v8i1);
+      InstCall *Call = makeHelperCall("Sz_bitcast_v8i1_to_i8", Dest, 1);
+      Call->addArg(Src0);
+      lowerCall(Call);
+    } break;
+    case IceType_i16: {
+      assert(Src0->getType() == IceType_v16i1);
+      InstCall *Call = makeHelperCall("Sz_bitcast_v16i1_to_i16", Dest, 1);
+      Call->addArg(Src0);
+      lowerCall(Call);
+    } break;
    case IceType_i32:
    case IceType_f32: {
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
@@ -1830,6 +1915,30 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
      _store(T_Hi, SpillHi);
      _movq(Dest, Spill);
    } break;
+    case IceType_v8i1: {
+      assert(Src0->getType() == IceType_i8);
+      InstCall *Call = makeHelperCall("Sz_bitcast_i8_to_v8i1", Dest, 1);
+      Variable *Src0AsI32 = Func->makeVariable(IceType_i32, Context.getNode());
+      // Arguments to functions are required to be at least 32 bits wide.
+      lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
+      Call->addArg(Src0AsI32);
+      lowerCall(Call);
+    } break;
+    case IceType_v16i1: {
+      assert(Src0->getType() == IceType_i16);
+      InstCall *Call = makeHelperCall("Sz_bitcast_i16_to_v16i1", Dest, 1);
+      Variable *Src0AsI32 = Func->makeVariable(IceType_i32, Context.getNode());
+      // Arguments to functions are required to be at least 32 bits wide.
+      lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
+      Call->addArg(Src0AsI32);
+      lowerCall(Call);
+    } break;
+    case IceType_v8i16:
+    case IceType_v16i8:
+    case IceType_v4i32:
+    case IceType_v4f32: {
+      _movp(Dest, legalizeToVar(Src0));
+    } break;
    }
    break;
  }
@@ -2875,6 +2984,29 @@ void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) {
  lowerCall(Call);
 }
+Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
+  // There is no support for loading or emitting vector constants, so
+  // this value is initialized using register operations.
+  Variable *Reg = makeReg(Ty, RegNum);
+  // Insert a FakeDef, since otherwise the live range of Reg might
+  // be overestimated.
+  Context.insert(InstFakeDef::create(Func, Reg));
+  _pxor(Reg, Reg);
+  return Reg;
+}
+Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) {
+  // There is no support for loading or emitting vector constants, so
+  // this value is initialized using register operations.
+  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
+  Variable *MinusOne = makeReg(Ty);
+  // Insert a FakeDef so the live range of MinusOne is not overestimated.
+  Context.insert(InstFakeDef::create(Func, MinusOne));
+  _pcmpeq(MinusOne, MinusOne);
+  _psub(Dest, MinusOne);
+  return Dest;
+}
 // Helper for legalize() to emit the right code to lower an operand to a
 // register of the appropriate type.
 Variable *TargetX8632::copyToReg(Operand *Src, int32_t RegNum) {
@@ -2937,19 +3069,9 @@ Operand *TargetX8632::legalize(Operand *From, LegalMask Allowed,
      // overestimated.  If the constant being lowered is a 64 bit value,
      // then the result should be split and the lo and hi components will
      // need to go in uninitialized registers.
+      if (isVectorType(From->getType()))
-      if (isVectorType(From->getType())) {
+        return makeVectorOfZeros(From->getType());
-        // There is no support for loading or emitting vector constants, so
+      From = Ctx->getConstantZero(From->getType());
-        // undef values are instead initialized in registers.
-        Variable *Reg = makeReg(From->getType(), RegNum);
-        // Insert a FakeDef, since otherwise the live range of Reg might
-        // be overestimated.
-        Context.insert(InstFakeDef::create(Func, Reg));
-        _pxor(Reg, Reg);
-        return Reg;
-      } else {
-        From = Ctx->getConstantZero(From->getType());
-      }
    }
    // There should be no constants of vector type (other than undef).
    assert(!isVectorType(From->getType()));

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -148,6 +148,10 @@ protected:
  Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
+  // Returns a vector in a register with the given constant entries.
+  Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
+  Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);
  // The following are helpers that insert lowered x86 instructions
  // with minimal syntactic overhead, so that the lowering code can
  // look as close to assembly as practical.
@@ -272,12 +276,30 @@ protected:
  void _or(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Or::create(Func, Dest, Src0));
  }
+  void _pand(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pand::create(Func, Dest, Src0));
+  }
+  void _pcmpeq(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
+  }
+  void _pcmpgt(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0));
+  }
  void _pop(Variable *Dest) {
    Context.insert(InstX8632Pop::create(Func, Dest));
  }
  void _push(Operand *Src0, bool SuppressStackAdjustment = false) {
    Context.insert(InstX8632Push::create(Func, Src0, SuppressStackAdjustment));
  }
+  void _psll(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Psll::create(Func, Dest, Src0));
+  }
+  void _psra(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Psra::create(Func, Dest, Src0));
+  }
+  void _psub(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Psub::create(Func, Dest, Src0));
+  }
  void _pxor(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Pxor::create(Func, Dest, Src0));
  }

--- a/tests_lit/llvm2ice_tests/vector-bitcast.ll
+++ b/tests_lit/llvm2ice_tests/vector-bitcast.ll
+; This file tests bitcasts of vector type. For most operations, these
+; should be lowered to a no-op on -O2.
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s --check-prefix=OPTM1
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+define <16 x i8> @test_bitcast_v16i8_to_v16i8(<16 x i8> %arg) {
+entry:
+  %res = bitcast <16 x i8> %arg to <16 x i8>
+  ret <16 x i8> %res
+; CHECK-LABEL: test_bitcast_v16i8_to_v16i8:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <8 x i16> @test_bitcast_v16i8_to_v8i16(<16 x i8> %arg) {
+entry:
+  %res = bitcast <16 x i8> %arg to <8 x i16>
+  ret <8 x i16> %res
+; CHECK-LABEL: test_bitcast_v16i8_to_v8i16:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <4 x i32> @test_bitcast_v16i8_to_v4i32(<16 x i8> %arg) {
+entry:
+  %res = bitcast <16 x i8> %arg to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_bitcast_v16i8_to_v4i32:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <4 x float> @test_bitcast_v16i8_to_v4f32(<16 x i8> %arg) {
+entry:
+  %res = bitcast <16 x i8> %arg to <4 x float>
+  ret <4 x float> %res
+; CHECK-LABEL: test_bitcast_v16i8_to_v4f32:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <16 x i8> @test_bitcast_v8i16_to_v16i8(<8 x i16> %arg) {
+entry:
+  %res = bitcast <8 x i16> %arg to <16 x i8>
+  ret <16 x i8> %res
+; CHECK-LABEL: test_bitcast_v8i16_to_v16i8:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <8 x i16> @test_bitcast_v8i16_to_v8i16(<8 x i16> %arg) {
+entry:
+  %res = bitcast <8 x i16> %arg to <8 x i16>
+  ret <8 x i16> %res
+; CHECK-LABEL: test_bitcast_v8i16_to_v8i16:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <4 x i32> @test_bitcast_v8i16_to_v4i32(<8 x i16> %arg) {
+entry:
+  %res = bitcast <8 x i16> %arg to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_bitcast_v8i16_to_v4i32:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <4 x float> @test_bitcast_v8i16_to_v4f32(<8 x i16> %arg) {
+entry:
+  %res = bitcast <8 x i16> %arg to <4 x float>
+  ret <4 x float> %res
+; CHECK-LABEL: test_bitcast_v8i16_to_v4f32:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <16 x i8> @test_bitcast_v4i32_to_v16i8(<4 x i32> %arg) {
+entry:
+  %res = bitcast <4 x i32> %arg to <16 x i8>
+  ret <16 x i8> %res
+; CHECK-LABEL: test_bitcast_v4i32_to_v16i8:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <8 x i16> @test_bitcast_v4i32_to_v8i16(<4 x i32> %arg) {
+entry:
+  %res = bitcast <4 x i32> %arg to <8 x i16>
+  ret <8 x i16> %res
+; CHECK-LABEL: test_bitcast_v4i32_to_v8i16:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <4 x i32> @test_bitcast_v4i32_to_v4i32(<4 x i32> %arg) {
+entry:
+  %res = bitcast <4 x i32> %arg to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_bitcast_v4i32_to_v4i32:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <4 x float> @test_bitcast_v4i32_to_v4f32(<4 x i32> %arg) {
+entry:
+  %res = bitcast <4 x i32> %arg to <4 x float>
+  ret <4 x float> %res
+; CHECK-LABEL: test_bitcast_v4i32_to_v4f32:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <16 x i8> @test_bitcast_v4f32_to_v16i8(<4 x float> %arg) {
+entry:
+  %res = bitcast <4 x float> %arg to <16 x i8>
+  ret <16 x i8> %res
+; CHECK-LABEL: test_bitcast_v4f32_to_v16i8:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <8 x i16> @test_bitcast_v4f32_to_v8i16(<4 x float> %arg) {
+entry:
+  %res = bitcast <4 x float> %arg to <8 x i16>
+  ret <8 x i16> %res
+; CHECK-LABEL: test_bitcast_v4f32_to_v8i16:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <4 x i32> @test_bitcast_v4f32_to_v4i32(<4 x float> %arg) {
+entry:
+  %res = bitcast <4 x float> %arg to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_bitcast_v4f32_to_v4i32:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define <4 x float> @test_bitcast_v4f32_to_v4f32(<4 x float> %arg) {
+entry:
+  %res = bitcast <4 x float> %arg to <4 x float>
+  ret <4 x float> %res
+; CHECK-LABEL: test_bitcast_v4f32_to_v4f32:
+; CHECK: .L{{.*}}entry:
+; CHECK-NEXT: ret
+}
+define i8 @test_bitcast_v8i1_to_i8(<8 x i1> %arg) {
+entry:
+  %res = bitcast <8 x i1> %arg to i8
+  ret i8 %res
+; CHECK-LABEL: test_bitcast_v8i1_to_i8:
+; CHECK: call Sz_bitcast_v8i1_to_i8
+; OPTM1-LABEL: test_bitcast_v8i1_to_i8:
+; OPTM1: call Sz_bitcast_v8i1_to_i8
+}
+define i16 @test_bitcast_v16i1_to_i16(<16 x i1> %arg) {
+entry:
+  %res = bitcast <16 x i1> %arg to i16
+  ret i16 %res
+; CHECK-LABEL: test_bitcast_v16i1_to_i16:
+; CHECK: call Sz_bitcast_v16i1_to_i16
+; OPTM1-LABEL: test_bitcast_v16i1_to_i16:
+; OPTM1: call Sz_bitcast_v16i1_to_i16
+}
+define <8 x i1> @test_bitcast_i8_to_v8i1(i32 %arg) {
+entry:
+  %arg.trunc = trunc i32 %arg to i8
+  %res = bitcast i8 %arg.trunc to <8 x i1>
+  ret <8 x i1> %res
+; CHECK-LABEL: test_bitcast_i8_to_v8i1:
+; CHECK: call Sz_bitcast_i8_to_v8i1
+; OPTM1-LABEL: test_bitcast_i8_to_v8i1:
+; OPTM1: call Sz_bitcast_i8_to_v8i1
+}
+define <16 x i1> @test_bitcast_i16_to_v16i1(i32 %arg) {
+entry:
+  %arg.trunc = trunc i32 %arg to i16
+  %res = bitcast i16 %arg.trunc to <16 x i1>
+  ret <16 x i1> %res
+; CHECK-LABEL: test_bitcast_i16_to_v16i1:
+; CHECK: call Sz_bitcast_i16_to_v16i1
+; OPTM1-LABEL: test_bitcast_i16_to_v16i1:
+; OPTM1: call Sz_bitcast_i16_to_v16i1
+}
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
--- a/tests_lit/llvm2ice_tests/vector-cast.ll
+++ b/tests_lit/llvm2ice_tests/vector-cast.ll
+; This file tests casting / conversion operations that apply to vector types.
+; bitcast operations are in vector-bitcast.ll.
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+; sext operations
+define <16 x i8> @test_sext_v16i1_to_v16i8(<16 x i1> %arg) {
+entry:
+  %res = sext <16 x i1> %arg to <16 x i8>
+  ret <16 x i8> %res
+; CHECK-LABEL: test_sext_v16i1_to_v16i8:
+; CHECK: pxor
+; CHECK: pcmpeqb
+; CHECK: psubb
+; CHECK: pand
+; CHECK: pxor
+; CHECK: pcmpgtb
+}
+define <8 x i16> @test_sext_v8i1_to_v8i16(<8 x i1> %arg) {
+entry:
+  %res = sext <8 x i1> %arg to <8 x i16>
+  ret <8 x i16> %res
+; CHECK-LABEL: test_sext_v8i1_to_v8i16:
+; CHECK: psllw {{.*}}, 15
+; CHECK: psraw {{.*}}, 15
+}
+define <4 x i32> @test_sext_v4i1_to_v4i32(<4 x i1> %arg) {
+entry:
+  %res = sext <4 x i1> %arg to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_sext_v4i1_to_v4i32:
+; CHECK: pslld {{.*}}, 31
+; CHECK: psrad {{.*}}, 31
+}
+; zext operations
+define <16 x i8> @test_zext_v16i1_to_v16i8(<16 x i1> %arg) {
+entry:
+  %res = zext <16 x i1> %arg to <16 x i8>
+  ret <16 x i8> %res
+; CHECK-LABEL: test_zext_v16i1_to_v16i8:
+; CHECK: pxor
+; CHECK: pcmpeqb
+; CHECK: psubb
+; CHECK: pand
+}
+define <8 x i16> @test_zext_v8i1_to_v8i16(<8 x i1> %arg) {
+entry:
+  %res = zext <8 x i1> %arg to <8 x i16>
+  ret <8 x i16> %res
+; CHECK-LABEL: test_zext_v8i1_to_v8i16:
+; CHECK: pxor
+; CHECK: pcmpeqw
+; CHECK: psubw
+; CHECK: pand
+}
+define <4 x i32> @test_zext_v4i1_to_v4i32(<4 x i1> %arg) {
+entry:
+  %res = zext <4 x i1> %arg to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_zext_v4i1_to_v4i32:
+; CHECK: pxor
+; CHECK: pcmpeqd
+; CHECK: psubd
+; CHECK: pand
+}
+; trunc operations
+define <16 x i1> @test_trunc_v16i8_to_v16i1(<16 x i8> %arg) {
+entry:
+  %res = trunc <16 x i8> %arg to <16 x i1>
+  ret <16 x i1> %res
+; CHECK-LABEL: test_trunc_v16i8_to_v16i1:
+; CHECK: pxor
+; CHECK: pcmpeqb
+; CHECK: psubb
+; CHECK: pand
+}
+define <8 x i1> @test_trunc_v8i16_to_v8i1(<8 x i16> %arg) {
+entry:
+  %res = trunc <8 x i16> %arg to <8 x i1>
+  ret <8 x i1> %res
+; CHECK-LABEL: test_trunc_v8i16_to_v8i1:
+; CHECK: pxor
+; CHECK: pcmpeqw
+; CHECK: psubw
+; CHECK: pand
+}
+define <4 x i1> @test_trunc_v4i32_to_v4i1(<4 x i32> %arg) {
+entry:
+  %res = trunc <4 x i32> %arg to <4 x i1>
+  ret <4 x i1> %res
+; CHECK-LABEL: test_trunc_v4i32_to_v4i1:
+; CHECK: pxor
+; CHECK: pcmpeqd
+; CHECK: psubd
+; CHECK: pand
+}
+; fpto[us]i operations
+define <4 x i32> @test_fptosi_v4f32_to_v4i32(<4 x float> %arg) {
+entry:
+  %res = fptosi <4 x float> %arg to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_fptosi_v4f32_to_v4i32:
+; CHECK: cvtps2dq
+}
+define <4 x i32> @test_fptoui_v4f32_to_v4i32(<4 x float> %arg) {
+entry:
+  %res = fptoui <4 x float> %arg to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: test_fptoui_v4f32_to_v4i32:
+; CHECK: call Sz_fptoui_v4f32
+}
+; [su]itofp operations
+define <4 x float> @test_sitofp_v4i32_to_v4f32(<4 x i32> %arg) {
+entry:
+  %res = sitofp <4 x i32> %arg to <4 x float>
+  ret <4 x float> %res
+; CHECK-LABEL: test_sitofp_v4i32_to_v4f32:
+; CHECK: cvtdq2ps
+}
+define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
+entry:
+  %res = uitofp <4 x i32> %arg to <4 x float>
+  ret <4 x float> %res
+; CHECK-LABEL: test_uitofp_v4i32_to_v4f32:
+; CHECK: call Sz_uitofp_v4i32
+}
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ