Lower the fcmp instruction for <4 x float> operands.

Most fcmp conditions map directly to single x86 instructions. For these, the lowering is table driven. BUG=none R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/413053002

Lower the fcmp instruction for <4 x float> operands.
ce0ca8f8 · Matt Wala · 9cb61e2f · ce0ca8f8 · ce0ca8f8 · ce0ca8f8
Commit ce0ca8f8 authored Jul 24, 2014 by Matt Wala
9 changed files
--- a/crosstest/test_fcmp.pnacl.ll
+++ b/crosstest/test_fcmp.pnacl.ll
 target triple = "i686-pc-linux-gnu"
-; This file is extracted from fp.pnacl.ll in the lit tests, with
+; This file is extracted from fp.pnacl.ll and vector-fcmp.ll in the lit
-; the "internal" attribute removed from the functions.
+; tests, with the "internal" attribute removed from the functions.
 define i32 @fcmpFalseFloat(float %a, float %b) {
 entry:
@@ -322,3 +322,151 @@ entry:
 }
 ; CHECK: fcmpTrueDouble:
 ; CHECK: mov {{.*}}, 1
+define <4 x i32> @fcmpFalseVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp false <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpFalseVector:
+; CHECK: pxor
+}
+define <4 x i32> @fcmpOeqVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp oeq <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOeqVector:
+; CHECK: cmpeqps
+}
+define <4 x i32> @fcmpOgeVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp oge <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOgeVector:
+; CHECK: cmpleps
+}
+define <4 x i32> @fcmpOgtVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ogt <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOgtVector:
+; CHECK: cmpltps
+}
+define <4 x i32> @fcmpOleVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ole <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOleVector:
+; CHECK: cmpleps
+}
+define <4 x i32> @fcmpOltVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp olt <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOltVector:
+; CHECK: cmpltps
+}
+define <4 x i32> @fcmpOneVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp one <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOneVector:
+; CHECK: cmpneqps
+; CHECK: cmpordps
+; CHECK: pand
+}
+define <4 x i32> @fcmpOrdVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ord <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOrdVector:
+; CHECK: cmpordps
+}
+define <4 x i32> @fcmpTrueVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp true <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpTrueVector:
+; CHECK: pcmpeqd
+}
+define <4 x i32> @fcmpUeqVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ueq <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUeqVector:
+; CHECK: cmpeqps
+; CHECK: cmpunordps
+; CHECK: por
+}
+define <4 x i32> @fcmpUgeVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp uge <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUgeVector:
+; CHECK: cmpnltps
+}
+define <4 x i32> @fcmpUgtVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ugt <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUgtVector:
+; CHECK: cmpnleps
+}
+define <4 x i32> @fcmpUleVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ule <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUleVector:
+; CHECK: cmpnltps
+}
+define <4 x i32> @fcmpUltVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ult <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUltVector:
+; CHECK: cmpnleps
+}
+define <4 x i32> @fcmpUneVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp une <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUneVector:
+; CHECK: cmpneqps
+}
+define <4 x i32> @fcmpUnoVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp uno <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUnoVector:
+; CHECK: cmpunordps
+}
--- a/crosstest/test_fcmp_main.cpp
+++ b/crosstest/test_fcmp_main.cpp
+//===- subzero/crosstest/test_fcmp_main.cpp - Driver for tests ------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for cross testing the fcmp bitcode instruction
+//
+//===----------------------------------------------------------------------===//
 /* crosstest.py --test=test_fcmp.pnacl.ll --driver=test_fcmp_main.cpp \
   --prefix=Subzero_ --output=test_fcmp */
 #include <cassert>
 #include <cfloat>
 #include <cmath>
+#include <cstring>
 #include <iostream>
+#include "vectors.h"
 #include "test_fcmp.def"
 #define X(cmp)                                                                 \
  extern "C" bool fcmp##cmp##Float(float a, float b);                          \
  extern "C" bool fcmp##cmp##Double(double a, double b);                       \
+  extern "C" v4si32 fcmp##cmp##Vector(v4f32 a, v4f32 b);                       \
  extern "C" bool Subzero_fcmp##cmp##Float(float a, float b);                  \
-  extern "C" bool Subzero_fcmp##cmp##Double(double a, double b);
+  extern "C" bool Subzero_fcmp##cmp##Double(double a, double b);               \
+  extern "C" v4si32 Subzero_fcmp##cmp##Vector(v4f32 a, v4f32 b);
 FCMP_TABLE;
 #undef X
-int main(int argc, char **argv) {
+volatile double *Values;
+size_t NumValues;
+void initializeValues() {
  static const double NegInf = -1.0 / 0.0;
  static const double Zero = 0.0;
  static const double Ten = 10.0;
@@ -30,12 +50,14 @@ int main(int argc, char **argv) {
  assert(NegInf < Zero);
  assert(NegInf < PosInf);
  assert(Zero < PosInf);
+  static volatile double InitValues[] = {NegInf,  -Zero, Zero,    DBL_MIN,
+                                         FLT_MIN, Ten,   FLT_MAX, DBL_MAX,
+                                         PosInf,  Nan,   NegNan};
+  NumValues = sizeof(InitValues) / sizeof(*InitValues);
+  Values = InitValues;
+}
-  volatile double Values[] = { NegInf, -Zero,   Zero,    DBL_MIN, FLT_MIN,
+void testsScalar(size_t &TotalTests, size_t &Passes, size_t &Failures) {
-                               Ten,    FLT_MAX, DBL_MAX, PosInf,  Nan,
-                               NegNan };
-  const static size_t NumValues = sizeof(Values) / sizeof(*Values);
  typedef bool (*FuncTypeFloat)(float, float);
  typedef bool (*FuncTypeDouble)(double, double);
  static struct {
@@ -58,9 +80,7 @@ int main(int argc, char **argv) {
  bool ResultSz, ResultLlc;
-  size_t TotalTests = 0;
+  assert(Values && NumValues);
-  size_t Passes = 0;
-  size_t Failures = 0;
  for (size_t f = 0; f < NumFuncs; ++f) {
    for (size_t i = 0; i < NumValues; ++i) {
@@ -76,7 +96,7 @@ int main(int argc, char **argv) {
          ++Failures;
          std::cout << Funcs[f].Name << "Float(" << Value1Float << ", "
                    << Value2Float << "): sz=" << ResultSz
-                    << " llc=" << ResultLlc << std::endl;
+                    << " llc=" << ResultLlc << "\n";
        }
        ++TotalTests;
        double Value1Double = Values[i];
@@ -89,11 +109,66 @@ int main(int argc, char **argv) {
          ++Failures;
          std::cout << Funcs[f].Name << "Double(" << Value1Double << ", "
                    << Value2Double << "): sz=" << ResultSz
-                    << " llc=" << ResultLlc << std::endl;
+                    << " llc=" << ResultLlc << "\n";
+        }
+      }
+    }
+  }
+}
+void testsVector(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef v4si32 (*FuncTypeVector)(v4f32, v4f32);
+  static struct {
+    const char *Name;
+    FuncTypeVector FuncVectorSz;
+    FuncTypeVector FuncVectorLlc;
+  } Funcs[] = {
+#define X(cmp)                                                                 \
+  { "fcmp" STR(cmp), Subzero_fcmp##cmp##Vector, fcmp##cmp##Vector }            \
+  ,
+        FCMP_TABLE
+#undef X
+    };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  const static size_t NumElementsInType = 4;
+  const static size_t MaxTestsPerFunc = 100000;
+  assert(Values && NumValues);
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    PRNG Index;
+    for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+      v4f32 Value1, Value2;
+      for (size_t j = 0; j < NumElementsInType; ++j) {
+        Value1[j] = Values[Index() % NumValues];
+        Value2[j] = Values[Index() % NumValues];
      }
+      ++TotalTests;
+      v4si32 ResultSz, ResultLlc;
+      ResultSz = Funcs[f].FuncVectorSz(Value1, Value2);
+      ResultLlc = Funcs[f].FuncVectorLlc(Value1, Value2);
+      if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << Funcs[f].Name << "Vector(" << vectAsString<v4f32>(Value1)
+                  << ", " << vectAsString<v4f32>(Value2)
+                  << "): sz=" << vectAsString<v4si32>(ResultSz)
+                  << " llc=" << vectAsString<v4si32>(ResultLlc) << "\n";
      }
    }
  }
+}
+int main(int argc, char **argv) {
+  size_t TotalTests = 0;
+  size_t Passes = 0;
+  size_t Failures = 0;
+  initializeValues();
+  testsScalar(TotalTests, Passes, Failures);
+  testsVector(TotalTests, Passes, Failures);
  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
            << " Failures=" << Failures << "\n";

--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -36,6 +36,18 @@ const struct InstX8632BrAttributes_ {
 const size_t InstX8632BrAttributesSize =
    llvm::array_lengthof(InstX8632BrAttributes);
+const struct InstX8632CmppsAttributes_ {
+  const char *EmitString;
+} InstX8632CmppsAttributes[] = {
+#define X(tag, emit)                                                           \
+  { emit }                                                                     \
+  ,
+    ICEINSTX8632CMPPS_TABLE
+#undef X
+  };
+const size_t InstX8632CmppsAttributesSize =
+    llvm::array_lengthof(InstX8632CmppsAttributes);
 const struct TypeX8632Attributes_ {
  const char *CvtString;   // i (integer), s (single FP), d (double FP)
  const char *SdSsString;  // ss, sd, or <blank>
@@ -149,6 +161,13 @@ InstX8632Cmov::InstX8632Cmov(Cfg *Func, Variable *Dest, Operand *Source,
  addSource(Source);
 }
+InstX8632Cmpps::InstX8632Cmpps(Cfg *Func, Variable *Dest, Operand *Source,
+                               InstX8632Cmpps::CmppsCond Condition)
+    : InstX8632(Func, InstX8632::Cmpps, 2, Dest), Condition(Condition) {
+  addSource(Dest);
+  addSource(Source);
+}
 InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
                                   Variable *Eax, Variable *Desired,
                                   bool Locked)
@@ -695,6 +714,28 @@ void InstX8632Cmov::dump(const Cfg *Func) const {
  dumpSources(Func);
 }
+void InstX8632Cmpps::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  assert(Condition < InstX8632CmppsAttributesSize);
+  Str << "\t";
+  Str << "cmp" << InstX8632CmppsAttributes[Condition].EmitString << "ps"
+      << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+  Str << "\n";
+}
+void InstX8632Cmpps::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  assert(Condition < InstX8632CmppsAttributesSize);
+  dumpDest(Func);
+  Str << " = cmp" << InstX8632CmppsAttributes[Condition].EmitString << "ps"
+      << "\t";
+  dumpSources(Func);
+}
 void InstX8632Cmpxchg::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 3);

--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -66,6 +66,18 @@
  X(Br_p,        "p",  "jp")   \
 //#define X(tag, dump, emit)
+#define ICEINSTX8632CMPPS_TABLE \
+  /* enum value, emit */        \
+  X(Cmpps_eq,    "eq")          \
+  X(Cmpps_lt,    "lt")          \
+  X(Cmpps_le,    "le")          \
+  X(Cmpps_unord, "unord")       \
+  X(Cmpps_neq,   "neq")         \
+  X(Cmpps_nlt,   "nlt")         \
+  X(Cmpps_nle,   "nle")         \
+  X(Cmpps_ord,   "ord")         \
+//#define X(tag, emit)
 #define ICETYPEX8632_TABLE                                        \
  /* tag,          element type, cvt, sdss,  pack, width */       \
  X(IceType_void,  IceType_void, "?" , ""  , "" ,  "???")         \

--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -145,6 +145,7 @@ public:
    Call,
    Cdq,
    Cmov,
+    Cmpps,
    Cmpxchg,
    Cmpxchg8b,
    Cvt,
@@ -714,6 +715,35 @@ private:
  BrCond Condition;
 };
+// Cmpps instruction - compare packed single-precision floating point
+// values
+class InstX8632Cmpps : public InstX8632 {
+public:
+  enum CmppsCond {
+#define X(tag, emit) tag,
+    ICEINSTX8632CMPPS_TABLE
+#undef X
+    Cmpps_Invalid
+  };
+  static InstX8632Cmpps *create(Cfg *Func, Variable *Dest, Operand *Source,
+                                CmppsCond Condition) {
+    return new (Func->allocate<InstX8632Cmpps>())
+        InstX8632Cmpps(Func, Dest, Source, Condition);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpps); }
+private:
+  InstX8632Cmpps(Cfg *Func, Variable *Dest, Operand *Source, CmppsCond Cond);
+  InstX8632Cmpps(const InstX8632Cmpps &) LLVM_DELETED_FUNCTION;
+  InstX8632Cmpps &operator=(const InstX8632Cmpps &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cmpps() {}
+  CmppsCond Condition;
+};
 // Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest>
 // equals eax. If so, the ZF is set and <desired> is stored in <dest>.
 // If not, ZF is cleared and <dest> is copied to eax (or subregister).

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -27,26 +27,38 @@ namespace Ice {
 namespace {
-// The following table summarizes the logic for lowering the fcmp instruction.
+// The following table summarizes the logic for lowering the fcmp
-// There is one table entry for each of the 16 conditions.  A comment in
+// instruction.  There is one table entry for each of the 16 conditions.
-// lowerFcmp() describes the lowering template.  In the most general case, there
+//
-// is a compare followed by two conditional branches, because some fcmp
+// The first four columns describe the case when the operands are
-// conditions don't map to a single x86 conditional branch.  However, in many
+// floating point scalar values.  A comment in lowerFcmp() describes the
-// cases it is possible to swap the operands in the comparison and have a single
+// lowering template.  In the most general case, there is a compare
-// conditional branch.  Since it's quite tedious to validate the table by hand,
+// followed by two conditional branches, because some fcmp conditions
-// good execution tests are helpful.
+// don't map to a single x86 conditional branch.  However, in many cases
+// it is possible to swap the operands in the comparison and have a
+// single conditional branch.  Since it's quite tedious to validate the
+// table by hand, good execution tests are helpful.
+//
+// The last two columns describe the case when the operands are vectors
+// of floating point values.  For most fcmp conditions, there is a clear
+// mapping to a single x86 cmpps instruction variant.  Some fcmp
+// conditions require special code to handle and these are marked in the
+// table with a Cmpps_Invalid predicate.
 const struct TableFcmp_ {
  uint32_t Default;
-  bool SwapOperands;
+  bool SwapScalarOperands;
  InstX8632::BrCond C1, C2;
+  bool SwapVectorOperands;
+  InstX8632Cmpps::CmppsCond Predicate;
 } TableFcmp[] = {
-#define X(val, dflt, swap, C1, C2)                                             \
+#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
-  { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 }                             \
+  {                                                                            \
+    dflt, swapS, InstX8632Br::C1, InstX8632Br::C2, swapV, InstX8632Cmpps::pred \
+  }                                                                            \
  ,
      FCMPX8632_TABLE
 #undef X
-  };
+};
 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
 // The following table summarizes the logic for lowering the icmp instruction
@@ -138,7 +150,7 @@ void xMacroIntegrityCheck() {
    // Define a temporary set of enum values based on low-level
    // table entries.
    enum _tmp_enum {
-#define X(val, dflt, swap, C1, C2) _tmp_##val,
+#define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
      FCMPX8632_TABLE
 #undef X
          _num
@@ -149,7 +161,7 @@ void xMacroIntegrityCheck() {
 #undef X
 // Define a set of constants based on low-level table entries,
 // and ensure the table entry keys are consistent.
-#define X(val, dflt, swap, C1, C2)                                             \
+#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
  static const int _table2_##val = _tmp_##val;                                 \
  STATIC_ASSERT(_table1_##val == _table2_##val);
    FCMPX8632_TABLE;
@@ -2213,6 +2225,68 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
  Operand *Src0 = Inst->getSrc(0);
  Operand *Src1 = Inst->getSrc(1);
  Variable *Dest = Inst->getDest();
+  if (isVectorType(Dest->getType())) {
+    InstFcmp::FCond Condition = Inst->getCondition();
+    size_t Index = static_cast<size_t>(Condition);
+    assert(Index < TableFcmpSize);
+    if (TableFcmp[Index].SwapVectorOperands) {
+      Operand *T = Src0;
+      Src0 = Src1;
+      Src1 = T;
+    }
+    Variable *T = NULL;
+    // ALIGNHACK: Without support for stack alignment, both operands to
+    // cmpps need to be forced into registers.  Once support for stack
+    // alignment is implemented, remove LEGAL_HACK.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+    switch (Condition) {
+    default: {
+      InstX8632Cmpps::CmppsCond Predicate = TableFcmp[Index].Predicate;
+      assert(Predicate != InstX8632Cmpps::Cmpps_Invalid);
+      T = makeReg(Src0->getType());
+      _movp(T, Src0);
+      _cmpps(T, LEGAL_HACK(Src1), Predicate);
+    } break;
+    case InstFcmp::False:
+      T = makeVectorOfZeros(Src0->getType());
+      break;
+    case InstFcmp::One: {
+      // Check both unequal and ordered.
+      T = makeReg(Src0->getType());
+      Variable *T2 = makeReg(Src0->getType());
+      Src1 = LEGAL_HACK(Src1);
+      _movp(T, Src0);
+      _cmpps(T, Src1, InstX8632Cmpps::Cmpps_neq);
+      _movp(T2, Src0);
+      _cmpps(T2, Src1, InstX8632Cmpps::Cmpps_ord);
+      _pand(T, T2);
+    } break;
+    case InstFcmp::Ueq: {
+      // Check either equal or unordered.
+      T = makeReg(Src0->getType());
+      Variable *T2 = makeReg(Src0->getType());
+      Src1 = LEGAL_HACK(Src1);
+      _movp(T, Src0);
+      _cmpps(T, Src1, InstX8632Cmpps::Cmpps_eq);
+      _movp(T2, Src0);
+      _cmpps(T2, Src1, InstX8632Cmpps::Cmpps_unord);
+      _por(T, T2);
+    } break;
+    case InstFcmp::True:
+      T = makeVectorOfMinusOnes(IceType_v4i32);
+      break;
+    }
+#undef LEGAL_HACK
+    _movp(Dest, T);
+    eliminateNextVectorSextInstruction(Dest);
+    return;
+  }
  // Lowering a = fcmp cond, b, c
  //   ucomiss b, c       /* only if C1 != Br_None */
  //                      /* but swap b,c order if SwapOperands==true */
@@ -2225,7 +2299,7 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
  InstFcmp::FCond Condition = Inst->getCondition();
  size_t Index = static_cast<size_t>(Condition);
  assert(Index < TableFcmpSize);
-  if (TableFcmp[Index].SwapOperands) {
+  if (TableFcmp[Index].SwapScalarOperands) {
    Operand *Tmp = Src0;
    Src0 = Src1;
    Src1 = Tmp;
@@ -2356,26 +2430,7 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
 #undef LEGAL_HACK
    _movp(Dest, T);
+    eliminateNextVectorSextInstruction(Dest);
-    // The following pattern occurs often in lowered C and C++ code:
-    //
-    //   %cmp     = icmp pred <n x ty> %src0, %src1
-    //   %cmp.ext = sext <n x i1> %cmp to <n x ty>
-    //
-    // We can avoid the sext operation by copying the result from pcmpgt
-    // and pcmpeq, which is already sign extended, to the result of the
-    // sext operation
-    if (InstCast *NextCast =
-            llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
-      if (NextCast->getCastKind() == InstCast::Sext &&
-          NextCast->getSrc(0) == Dest) {
-        _movp(NextCast->getDest(), T);
-        // Skip over the instruction.
-        NextCast->setDeleted();
-        Context.advanceNext();
-      }
-    }
    return;
  }
@@ -3544,6 +3599,28 @@ void TargetX8632::lowerSwitch(const InstSwitch *Inst) {
  _br(Inst->getLabelDefault());
 }
+// The following pattern occurs often in lowered C and C++ code:
+//
+//   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
+//   %cmp.ext = sext <n x i1> %cmp to <n x ty>
+//
+// We can eliminate the sext operation by copying the result of pcmpeqd,
+// pcmpgtd, or cmpps (which produce sign extended results) to the result
+// of the sext operation.
+void
+TargetX8632::eliminateNextVectorSextInstruction(Variable *SignExtendedResult) {
+  if (InstCast *NextCast =
+          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
+    if (NextCast->getCastKind() == InstCast::Sext &&
+        NextCast->getSrc(0) == SignExtendedResult) {
+      _movp(NextCast->getDest(), legalizeToVar(SignExtendedResult));
+      // Skip over the instruction.
+      NextCast->setDeleted();
+      Context.advanceNext();
+    }
+  }
+}
 void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) {
  const SizeT MaxSrcs = 0;
  Variable *Dest = NULL;

--- a/src/IceTargetLoweringX8632.def
+++ b/src/IceTargetLoweringX8632.def
@@ -16,24 +16,25 @@
 #define SUBZERO_SRC_ICETARGETLOWERINGX8632_DEF
 #define FCMPX8632_TABLE                                              \
-  /* val,  dflt, swap, C1,      C2 */    \
+  /*       <---- scalar comparison ---->  <- vector comparison -> */ \
-  X(False, 0,    0,    Br_None, Br_None) \
+  /* val,  dflt, swap, C1,      C2,       swap,  predicate        */ \
-  X(Oeq,   0,    0,    Br_ne,   Br_p)    \
+  X(False, 0,    0,    Br_None, Br_None,  0,     Cmpps_Invalid)      \
-  X(Ogt,   1,    0,    Br_a,    Br_None) \
+  X(Oeq,   0,    0,    Br_ne,   Br_p,     0,     Cmpps_eq)           \
-  X(Oge,   1,    0,    Br_ae,   Br_None) \
+  X(Ogt,   1,    0,    Br_a,    Br_None,  1,     Cmpps_lt)           \
-  X(Olt,   1,    1,    Br_a,    Br_None) \
+  X(Oge,   1,    0,    Br_ae,   Br_None,  1,     Cmpps_le)           \
-  X(Ole,   1,    1,    Br_ae,   Br_None) \
+  X(Olt,   1,    1,    Br_a,    Br_None,  0,     Cmpps_lt)           \
-  X(One,   1,    0,    Br_ne,   Br_None) \
+  X(Ole,   1,    1,    Br_ae,   Br_None,  0,     Cmpps_le)           \
-  X(Ord,   1,    0,    Br_np,   Br_None) \
+  X(One,   1,    0,    Br_ne,   Br_None,  0,     Cmpps_Invalid)      \
-  X(Ueq,   1,    0,    Br_e,    Br_None) \
+  X(Ord,   1,    0,    Br_np,   Br_None,  0,     Cmpps_ord)          \
-  X(Ugt,   1,    1,    Br_b,    Br_None) \
+  X(Ueq,   1,    0,    Br_e,    Br_None,  0,     Cmpps_Invalid)      \
-  X(Uge,   1,    1,    Br_be,   Br_None) \
+  X(Ugt,   1,    1,    Br_b,    Br_None,  0,     Cmpps_nle)          \
-  X(Ult,   1,    0,    Br_b,    Br_None) \
+  X(Uge,   1,    1,    Br_be,   Br_None,  0,     Cmpps_nlt)          \
-  X(Ule,   1,    0,    Br_be,   Br_None) \
+  X(Ult,   1,    0,    Br_b,    Br_None,  1,     Cmpps_nle)          \
-  X(Une,   1,    0,    Br_ne,   Br_p)    \
+  X(Ule,   1,    0,    Br_be,   Br_None,  1,     Cmpps_nlt)          \
-  X(Uno,   1,    0,    Br_p,    Br_None) \
+  X(Une,   1,    0,    Br_ne,   Br_p,     0,     Cmpps_neq)          \
-  X(True,  1,    0,    Br_None, Br_None) \
+  X(Uno,   1,    0,    Br_p,    Br_None,  0,     Cmpps_unord)        \
-//#define X(val, dflt, swap, C1, C2)
+  X(True,  1,    0,    Br_None, Br_None,  0,     Cmpps_Invalid)      \
+//#define X(val, dflt, swapS, C1, C2, swapV, pred)
 #define ICMPX8632_TABLE                     \
  /* val, C_32,  C1_64,   C2_64,   C3_64 */ \

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -108,6 +108,8 @@ protected:
  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
                                Variable *Dest, Operand *Ptr, Operand *Val);
+  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
  // Operand legalization helpers.  To deal with address mode
  // constraints, the helpers will create a new Operand and emit
  // instructions that guarantee that the Operand kind is one of those
@@ -212,6 +214,10 @@ protected:
  void _cmp(Operand *Src0, Operand *Src1) {
    Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
  }
+  void _cmpps(Variable *Dest, Operand *Src0,
+              InstX8632Cmpps::CmppsCond Condition) {
+    Context.insert(InstX8632Cmpps::create(Func, Dest, Src0, Condition));
+  }
  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
                bool Locked) {
    Context.insert(

--- a/tests_lit/llvm2ice_tests/vector-fcmp.ll
+++ b/tests_lit/llvm2ice_tests/vector-fcmp.ll
+; This file checks support for comparing vector values with the fcmp
+; instruction.
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+; Check that sext elimination occurs when the result of the comparison
+; instruction is already sign extended.  Sign extension to 4 x i32 uses
+; the pslld instruction.
+define <4 x i32> @sextElimination(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp oeq <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: sextElimination:
+; CHECK: cmpeqps
+; CHECK-NOT: pslld
+}
+define <4 x i32> @fcmpFalseVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp false <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpFalseVector:
+; CHECK: pxor
+}
+define <4 x i32> @fcmpOeqVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp oeq <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOeqVector:
+; CHECK: cmpeqps
+}
+define <4 x i32> @fcmpOgeVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp oge <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOgeVector:
+; CHECK: cmpleps
+}
+define <4 x i32> @fcmpOgtVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ogt <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOgtVector:
+; CHECK: cmpltps
+}
+define <4 x i32> @fcmpOleVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ole <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOleVector:
+; CHECK: cmpleps
+}
+define <4 x i32> @fcmpOltVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp olt <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOltVector:
+; CHECK: cmpltps
+}
+define <4 x i32> @fcmpOneVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp one <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOneVector:
+; CHECK: cmpneqps
+; CHECK: cmpordps
+; CHECK: pand
+}
+define <4 x i32> @fcmpOrdVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ord <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpOrdVector:
+; CHECK: cmpordps
+}
+define <4 x i32> @fcmpTrueVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp true <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpTrueVector:
+; CHECK: pcmpeqd
+}
+define <4 x i32> @fcmpUeqVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ueq <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUeqVector:
+; CHECK: cmpeqps
+; CHECK: cmpunordps
+; CHECK: por
+}
+define <4 x i32> @fcmpUgeVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp uge <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUgeVector:
+; CHECK: cmpnltps
+}
+define <4 x i32> @fcmpUgtVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ugt <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUgtVector:
+; CHECK: cmpnleps
+}
+define <4 x i32> @fcmpUleVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ule <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUleVector:
+; CHECK: cmpnltps
+}
+define <4 x i32> @fcmpUltVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp ult <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUltVector:
+; CHECK: cmpnleps
+}
+define <4 x i32> @fcmpUneVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp une <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUneVector:
+; CHECK: cmpneqps
+}
+define <4 x i32> @fcmpUnoVector(<4 x float> %a, <4 x float> %b) {
+entry:
+  %res.trunc = fcmp uno <4 x float> %a, %b
+  %res = sext <4 x i1> %res.trunc to <4 x i32>
+  ret <4 x i32> %res
+; CHECK-LABEL: fcmpUnoVector:
+; CHECK: cmpunordps
+}
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ