Commit 9a0168a9 by Matt Wala

Lower icmp operations between vector values.

SSE2 only has signed integer comparison. Unsigned compares are implemented by inverting the sign bits of the operands and doing a signed compare. A common pattern in clang generated IR is a vector compare which generates an i1 vector followed by a sign extension of the result of the compare. The x86 comparison instructions already generate sign extended values, so we can eliminate unnecessary sext operations that follow compares in the IR. BUG=none R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/412593002
parent 87543355
...@@ -67,7 +67,7 @@ for optlevel in ${OPTLEVELS} ; do ...@@ -67,7 +67,7 @@ for optlevel in ${OPTLEVELS} ; do
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \ --dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_icmp.cpp \ --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
--driver=test_icmp_main.cpp \ --driver=test_icmp_main.cpp \
--output=test_icmp_O${optlevel} --output=test_icmp_O${optlevel}
......
// This aims to test the icmp bitcode instruction across all PNaCl //===- subzero/crosstest/test_icmp.cpp - Implementation for tests ---------===//
// primitive integer types. //
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This aims to test the icmp bitcode instruction across all PNaCl primitive
// and SIMD integer types.
//
//===----------------------------------------------------------------------===//
#include <stdint.h> #include <stdint.h>
...@@ -9,13 +20,20 @@ ...@@ -9,13 +20,20 @@
bool icmp##cmp(uint8_t a, uint8_t b) { return a op b; } \ bool icmp##cmp(uint8_t a, uint8_t b) { return a op b; } \
bool icmp##cmp(uint16_t a, uint16_t b) { return a op b; } \ bool icmp##cmp(uint16_t a, uint16_t b) { return a op b; } \
bool icmp##cmp(uint32_t a, uint32_t b) { return a op b; } \ bool icmp##cmp(uint32_t a, uint32_t b) { return a op b; } \
bool icmp##cmp(uint64_t a, uint64_t b) { return a op b; } bool icmp##cmp(uint64_t a, uint64_t b) { return a op b; } \
v4ui32 icmp##cmp(v4ui32 a, v4ui32 b) { return a op b; } \
v8ui16 icmp##cmp(v8ui16 a, v8ui16 b) { return a op b; } \
v16ui8 icmp##cmp(v16ui8 a, v16ui8 b) { return a op b; }
ICMP_U_TABLE ICMP_U_TABLE
#undef X #undef X
#define X(cmp, op) \ #define X(cmp, op) \
bool icmp##cmp(int8_t a, int8_t b) { return a op b; } \ bool icmp##cmp(int8_t a, int8_t b) { return a op b; } \
bool icmp##cmp(int16_t a, int16_t b) { return a op b; } \ bool icmp##cmp(int16_t a, int16_t b) { return a op b; } \
bool icmp##cmp(int32_t a, int32_t b) { return a op b; } \ bool icmp##cmp(int32_t a, int32_t b) { return a op b; } \
bool icmp##cmp(int64_t a, int64_t b) { return a op b; } bool icmp##cmp(int64_t a, int64_t b) { return a op b; } \
v4si32 icmp##cmp(v4si32 a, v4si32 b) { return a op b; } \
v8si16 icmp##cmp(v8si16 a, v8si16 b) { return a op b; } \
v16si8 icmp##cmp(v16si8 a, v16si8 b) { return a op b; }
ICMP_S_TABLE ICMP_S_TABLE
#undef X #undef X
//===- subzero/crosstest/test_icmp.h - Test prototypes -------*- C++ -*----===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the function prototypes for crosstesting the icmp
// bitcode instruction.
//
//===----------------------------------------------------------------------===//
#include "test_icmp.def" #include "test_icmp.def"
#include "vectors.h"
#define X(cmp, op) \ #define X(cmp, op) \
bool icmp##cmp(uint8_t a, uint8_t b); \ bool icmp##cmp(uint8_t a, uint8_t b); \
bool icmp##cmp(uint16_t a, uint16_t b); \ bool icmp##cmp(uint16_t a, uint16_t b); \
bool icmp##cmp(uint32_t a, uint32_t b); \ bool icmp##cmp(uint32_t a, uint32_t b); \
bool icmp##cmp(uint64_t a, uint64_t b); bool icmp##cmp(uint64_t a, uint64_t b); \
v4ui32 icmp##cmp(v4ui32 a, v4ui32 b); \
v8ui16 icmp##cmp(v8ui16 a, v8ui16 b); \
v16ui8 icmp##cmp(v16ui8 a, v16ui8 b);
ICMP_U_TABLE ICMP_U_TABLE
#undef X #undef X
...@@ -12,6 +31,17 @@ ICMP_U_TABLE ...@@ -12,6 +31,17 @@ ICMP_U_TABLE
bool icmp##cmp(int8_t a, int8_t b); \ bool icmp##cmp(int8_t a, int8_t b); \
bool icmp##cmp(int16_t a, int16_t b); \ bool icmp##cmp(int16_t a, int16_t b); \
bool icmp##cmp(int32_t a, int32_t b); \ bool icmp##cmp(int32_t a, int32_t b); \
bool icmp##cmp(int64_t a, int64_t b); bool icmp##cmp(int64_t a, int64_t b); \
v4si32 icmp##cmp(v4si32 a, v4si32 b); \
v8si16 icmp##cmp(v8si16 a, v8si16 b); \
v16si8 icmp##cmp(v16si8 a, v16si8 b);
ICMP_S_TABLE
#undef X
#define X(cmp, op) \
v4si32 icmpi1##cmp(v4si32 a, v4si32 b); \
v8si16 icmpi1##cmp(v8si16 a, v8si16 b); \
v16si8 icmpi1##cmp(v16si8 a, v16si8 b);
ICMP_U_TABLE
ICMP_S_TABLE ICMP_S_TABLE
#undef X #undef X
; Crosstest inputs for icmp on vectors of i1.  Each function is written
; in the sext(icmp(trunc %a, trunc %b)) form that clang emits for C/C++
; vector comparisons: the arguments are truncated to <N x i1>, compared
; with one predicate, and the i1 result is sign extended back to the
; original element width.  Function names are the Itanium manglings the
; crosstest driver expects (e.g. icmpi1Eq(v16si8, v16si8)).
target triple = "i686-pc-linux-gnu"
; ---- <16 x i8> operand versions (compare as <16 x i1>) ----
define <16 x i8> @_Z8icmpi1EqDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp eq <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z8icmpi1NeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp ne <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1UgtDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp ugt <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1UgeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp uge <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1UltDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp ult <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1UleDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp ule <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1SgtDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp sgt <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1SgeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp sge <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1SltDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp slt <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1SleDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp sle <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
; ---- <8 x i16> operand versions (compare as <8 x i1>) ----
define <8 x i16> @_Z8icmpi1EqDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp eq <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z8icmpi1NeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp ne <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1UgtDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp ugt <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1UgeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp uge <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1UltDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp ult <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1UleDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp ule <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1SgtDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp sgt <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1SgeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp sge <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1SltDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp slt <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1SleDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp sle <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
; ---- <4 x i32> operand versions (compare as <4 x i1>) ----
define <4 x i32> @_Z8icmpi1EqDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp eq <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z8icmpi1NeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp ne <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1UgtDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp ugt <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1UgeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp uge <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1UltDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp ult <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1UleDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp ule <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1SgtDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp sgt <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1SgeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp sge <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1SltDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp slt <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1SleDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp sle <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
/* crosstest.py --test=test_icmp.cpp --driver=test_icmp_main.cpp \ //===- subzero/crosstest/test_icmp_main.cpp - Driver for tests. -----------===//
--prefix=Subzero_ --output=test_icmp */ //
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Driver for cross testing the icmp bitcode instruction
//
//===----------------------------------------------------------------------===//
/* crosstest.py --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
--driver=test_icmp_main.cpp --prefix=Subzero_ --output=test_icmp */
#include <climits> // CHAR_BIT
#include <cstring> // memcmp, memset
#include <stdint.h> #include <stdint.h>
#include <iostream> #include <iostream>
...@@ -63,8 +78,9 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -63,8 +78,9 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
++Passes; ++Passes;
} else { } else {
++Failures; ++Failures;
std::cout << "icmp" << Funcs[f].Name << (8 * sizeof(TypeUnsigned)) std::cout << "icmp" << Funcs[f].Name
<< "(" << Value1 << ", " << Value2 << "): sz=" << ResultSz << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
<< ", " << Value2 << "): sz=" << ResultSz
<< " llc=" << ResultLlc << std::endl; << " llc=" << ResultLlc << std::endl;
} }
} }
...@@ -90,8 +106,8 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -90,8 +106,8 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} else { } else {
++Failures; ++Failures;
std::cout << "icmp" << Funcs[f].Name std::cout << "icmp" << Funcs[f].Name
<< (8 * sizeof(TypeUnsigned)) << "(" << Value1 << ", " << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
<< Value2 << "): sz=" << ResultSz << ", " << Value2 << "): sz=" << ResultSz
<< " llc=" << ResultLlc << std::endl; << " llc=" << ResultLlc << std::endl;
} }
} }
...@@ -102,6 +118,155 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -102,6 +118,155 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
} }
// Upper bound on the number of pseudorandom input pairs tried per
// comparison function.
const static size_t MaxTestsPerFunc = 100000;

// Crosstests the icmp instruction on vectors of integers.  For every
// comparison in ICMP_U_TABLE and ICMP_S_TABLE, pseudorandom vectors are
// fed to both the llc-compiled reference (icmp##cmp) and the
// Subzero-compiled version (Subzero_::icmp##cmp), and the two results
// are compared bytewise.  Counters are updated in place; mismatches are
// reported on std::cout.
template <typename TypeUnsignedLabel, typename TypeSignedLabel>
void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned;
  typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned;
  typedef TypeUnsigned (*FuncTypeUnsigned)(TypeUnsigned, TypeUnsigned);
  typedef TypeSigned (*FuncTypeSigned)(TypeSigned, TypeSigned);
  static struct {
    const char *Name;
    FuncTypeUnsigned FuncLlc;
    FuncTypeUnsigned FuncSz;
  } Funcs[] = {
// Fix: stringify the macro argument (the comparison name) rather than
// the literal token "inst", so failure messages identify which
// comparison failed instead of always printing "inst".
#define X(cmp, op)                                                             \
  {                                                                            \
    STR(cmp), (FuncTypeUnsigned)icmp##cmp,                                     \
    (FuncTypeUnsigned)Subzero_::icmp##cmp                                      \
  }                                                                            \
  ,
      ICMP_U_TABLE
#undef X
// The signed table entries are cast through the signed function type so
// that both flavors can share one unsigned-typed table slot.
#define X(cmp, op)                                                             \
  {                                                                            \
    STR(cmp), (FuncTypeUnsigned)(FuncTypeSigned)icmp##cmp,                     \
    (FuncTypeUnsigned)(FuncTypeSigned)Subzero_::icmp##cmp                      \
  }                                                                            \
  ,
      ICMP_S_TABLE
#undef X
  };
  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
  const static size_t NumElementsInType = Vectors<TypeUnsigned>::NumElements;
  for (size_t f = 0; f < NumFuncs; ++f) {
    PRNG Index;
    for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
      // Initialize the test vectors with pseudorandomly chosen values.
      TypeUnsigned Value1, Value2;
      for (size_t j = 0; j < NumElementsInType; ++j) {
        Value1[j] = Values[Index() % NumValues];
        Value2[j] = Values[Index() % NumValues];
      }
      // Perform the test.
      TypeUnsigned ResultSz = Funcs[f].FuncSz(Value1, Value2);
      TypeUnsigned ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
      ++TotalTests;
      if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
        ++Passes;
      } else {
        ++Failures;
        std::cout << "test" << Funcs[f].Name
                  << Vectors<TypeUnsignedLabel>::TypeName << "("
                  << vectAsString<TypeUnsignedLabel>(Value1) << ","
                  << vectAsString<TypeUnsignedLabel>(Value2)
                  << "): sz=" << vectAsString<TypeUnsignedLabel>(ResultSz)
                  << " llc=" << vectAsString<TypeUnsignedLabel>(ResultLlc)
                  << std::endl;
      }
    }
  }
}
// Binary-increments Vect, viewed as a little-endian string of 0/1
// elements (element 0 is the least significant digit).  Returns true
// exactly when the increment wraps the vector back to all zeros.
template <typename T> bool incrementI1Vector(typename Vectors<T>::Ty &Vect) {
  const static size_t NumElements = Vectors<T>::NumElements;
  size_t Pos = 0;
  while (Pos < NumElements) {
    if (Vect[Pos] == 0) {
      // Found a clear digit: set it and stop carrying.
      Vect[Pos] = 1;
      return false;
    }
    // Digit was set: clear it and carry into the next position.
    Vect[Pos] = 0;
    ++Pos;
  }
  // Every digit carried out, so the vector wrapped around.
  return true;
}
// Crosstests the icmp instruction on vectors of i1.  When the vector is
// short enough, every possible pair of 0/1 input vectors is tried
// exhaustively; otherwise pseudorandom 0/1 vectors are used.  For each
// comparison the llc-compiled reference (icmpi1##cmp) and the
// Subzero-compiled version (Subzero_::icmpi1##cmp) are run and their
// results compared bytewise.  Counters are updated in place; mismatches
// are reported on std::cout.
template <typename T>
void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  typedef typename Vectors<T>::Ty Ty;
  typedef Ty (*FuncType)(Ty, Ty);
  static struct {
    const char *Name;
    FuncType FuncLlc;
    FuncType FuncSz;
  } Funcs[] = {
// Fix: stringify the macro argument (the comparison name) rather than
// the literal token "inst", so failure messages identify which
// comparison failed instead of always printing "inst".
#define X(cmp, op)                                                             \
  { STR(cmp), (FuncType)icmpi1##cmp, (FuncType)Subzero_::icmpi1##cmp }         \
  ,
      ICMP_U_TABLE
      ICMP_S_TABLE
#undef X
  };
  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
  const static size_t NumElements = Vectors<T>::NumElements;
  const static size_t MAX_NUMBER_OF_ELEMENTS_FOR_EXHAUSTIVE_TESTING = 8;
  // Check if the type is small enough to try all possible input pairs.
  if (NumElements <= MAX_NUMBER_OF_ELEMENTS_FOR_EXHAUSTIVE_TESTING) {
    for (size_t f = 0; f < NumFuncs; ++f) {
      // Enumerate all 2^NumElements values for each operand via
      // incrementI1Vector, which reports the wraparound back to zero.
      Ty Value1, Value2;
      memset(&Value1, 0, sizeof(Value1));
      for (bool IsValue1Done = false; !IsValue1Done;
           IsValue1Done = incrementI1Vector<T>(Value1)) {
        memset(&Value2, 0, sizeof(Value2));
        for (bool IsValue2Done = false; !IsValue2Done;
             IsValue2Done = incrementI1Vector<T>(Value2)) {
          Ty ResultSz = Funcs[f].FuncSz(Value1, Value2);
          Ty ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
          ++TotalTests;
          if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
            ++Passes;
          } else {
            ++Failures;
            std::cout << "test" << Funcs[f].Name << Vectors<T>::TypeName << "("
                      << vectAsString<T>(Value1) << ","
                      << vectAsString<T>(Value2)
                      << "): sz=" << vectAsString<T>(ResultSz)
                      << " llc=" << vectAsString<T>(ResultLlc) << std::endl;
          }
        }
      }
    }
  } else {
    for (size_t f = 0; f < NumFuncs; ++f) {
      PRNG Index;
      for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
        Ty Value1, Value2;
        // Initialize the test vectors with pseudorandom 0/1 elements.
        for (size_t j = 0; j < NumElements; ++j) {
          Value1[j] = Index() % 2;
          Value2[j] = Index() % 2;
        }
        // Perform the test.
        Ty ResultSz = Funcs[f].FuncSz(Value1, Value2);
        Ty ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
        ++TotalTests;
        if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
          ++Passes;
        } else {
          ++Failures;
          std::cout << "test" << Funcs[f].Name << Vectors<T>::TypeName << "("
                    << vectAsString<T>(Value1) << "," << vectAsString<T>(Value2)
                    << "): sz=" << vectAsString<T>(ResultSz)
                    << " llc=" << vectAsString<T>(ResultLlc) << std::endl;
        }
      }
    }
  }
}
int main(int argc, char **argv) { int main(int argc, char **argv) {
size_t TotalTests = 0; size_t TotalTests = 0;
size_t Passes = 0; size_t Passes = 0;
...@@ -111,6 +276,12 @@ int main(int argc, char **argv) { ...@@ -111,6 +276,12 @@ int main(int argc, char **argv) {
testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures); testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures); testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures); testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures);
testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures);
testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures);
testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures);
testsVecI1<v4i1>(TotalTests, Passes, Failures);
testsVecI1<v8i1>(TotalTests, Passes, Failures);
testsVecI1<v16i1>(TotalTests, Passes, Failures);
std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
<< " Failures=" << Failures << "\n"; << " Failures=" << Failures << "\n";
......
...@@ -2261,6 +2261,124 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { ...@@ -2261,6 +2261,124 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
Operand *Src1 = legalize(Inst->getSrc(1)); Operand *Src1 = legalize(Inst->getSrc(1));
Variable *Dest = Inst->getDest(); Variable *Dest = Inst->getDest();
if (isVectorType(Dest->getType())) {
Type Ty = Src0->getType();
// Promote i1 vectors to 128 bit integer vector types.
if (typeElementType(Ty) == IceType_i1) {
Type NewTy = IceType_NUM;
switch (Ty) {
default:
llvm_unreachable("unexpected type");
break;
case IceType_v4i1:
NewTy = IceType_v4i32;
break;
case IceType_v8i1:
NewTy = IceType_v8i16;
break;
case IceType_v16i1:
NewTy = IceType_v16i8;
break;
}
Variable *NewSrc0 = Func->makeVariable(NewTy, Context.getNode());
Variable *NewSrc1 = Func->makeVariable(NewTy, Context.getNode());
lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
Src0 = NewSrc0;
Src1 = NewSrc1;
Ty = NewTy;
}
InstIcmp::ICond Condition = Inst->getCondition();
// SSE2 only has signed comparison operations. Transform unsigned
// inputs in a manner that allows for the use of signed comparison
// operations by flipping the high order bits.
if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
Variable *T0 = makeReg(Ty);
Variable *T1 = makeReg(Ty);
Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
_movp(T0, Src0);
_pxor(T0, HighOrderBits);
_movp(T1, Src1);
_pxor(T1, HighOrderBits);
Src0 = T0;
Src1 = T1;
}
// TODO: ALIGNHACK: Both operands to compare instructions need to be
// in registers until stack alignment support is implemented. Once
// there is support for stack alignment, LEGAL_HACK can be removed.
#define LEGAL_HACK(Vect) legalizeToVar((Vect))
Variable *T = makeReg(Ty);
switch (Condition) {
default:
llvm_unreachable("unexpected condition");
break;
case InstIcmp::Eq: {
_movp(T, Src0);
_pcmpeq(T, LEGAL_HACK(Src1));
} break;
case InstIcmp::Ne: {
_movp(T, Src0);
_pcmpeq(T, LEGAL_HACK(Src1));
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
case InstIcmp::Ugt:
case InstIcmp::Sgt: {
_movp(T, Src0);
_pcmpgt(T, LEGAL_HACK(Src1));
} break;
case InstIcmp::Uge:
case InstIcmp::Sge: {
// !(Src1 > Src0)
_movp(T, Src1);
_pcmpgt(T, LEGAL_HACK(Src0));
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
case InstIcmp::Ult:
case InstIcmp::Slt: {
_movp(T, Src1);
_pcmpgt(T, LEGAL_HACK(Src0));
} break;
case InstIcmp::Ule:
case InstIcmp::Sle: {
// !(Src0 > Src1)
_movp(T, Src0);
_pcmpgt(T, LEGAL_HACK(Src1));
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
}
#undef LEGAL_HACK
_movp(Dest, T);
// The following pattern occurs often in lowered C and C++ code:
//
// %cmp = icmp pred <n x ty> %src0, %src1
// %cmp.ext = sext <n x i1> %cmp to <n x ty>
//
// We can avoid the sext operation by copying the result from pcmpgt
// and pcmpeq, which is already sign extended, to the result of the
// sext operation
if (InstCast *NextCast =
llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
if (NextCast->getCastKind() == InstCast::Sext &&
NextCast->getSrc(0) == Dest) {
_movp(NextCast->getDest(), T);
// Skip over the instruction.
NextCast->setDeleted();
Context.advanceNext();
}
}
return;
}
// If Src1 is an immediate, or known to be a physical register, we can // If Src1 is an immediate, or known to be a physical register, we can
// allow Src0 to be a memory operand. Otherwise, Src0 must be copied into // allow Src0 to be a memory operand. Otherwise, Src0 must be copied into
// a physical register. (Actually, either Src0 or Src1 can be chosen for // a physical register. (Actually, either Src0 or Src1 can be chosen for
...@@ -3398,9 +3516,14 @@ void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) { ...@@ -3398,9 +3516,14 @@ void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) {
lowerCall(Call); lowerCall(Call);
} }
// There is no support for loading or emitting vector constants, so the
// vector values returned from makeVectorOfZeros, makeVectorOfOnes,
// etc. are initialized with register operations.
//
// TODO(wala): Add limited support for vector constants so that
// complex initialization in registers is unnecessary.
Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) { Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
// There is no support for loading or emitting vector constants, so
// this value is initialized using register operations.
Variable *Reg = makeReg(Ty, RegNum); Variable *Reg = makeReg(Ty, RegNum);
// Insert a FakeDef, since otherwise the live range of Reg might // Insert a FakeDef, since otherwise the live range of Reg might
// be overestimated. // be overestimated.
...@@ -3409,18 +3532,41 @@ Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) { ...@@ -3409,18 +3532,41 @@ Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
return Reg; return Reg;
} }
// Materializes a vector register with every lane equal to -1 (all bits
// set).  pcmpeq of a register against itself always compares equal, so
// this needs no constant load from memory.
Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, int32_t RegNum) {
  Variable *Reg = makeReg(Ty, RegNum);
  // A FakeDef keeps the live range of Reg from being overestimated.
  Context.insert(InstFakeDef::create(Func, Reg));
  _pcmpeq(Reg, Reg);
  return Reg;
}
Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) { Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) {
// There is no support for loading or emitting vector constants, so
// this value is initialized using register operations.
Variable *Dest = makeVectorOfZeros(Ty, RegNum); Variable *Dest = makeVectorOfZeros(Ty, RegNum);
Variable *MinusOne = makeReg(Ty); Variable *MinusOne = makeVectorOfMinusOnes(Ty);
// Insert a FakeDef so the live range of MinusOne is not overestimated.
Context.insert(InstFakeDef::create(Func, MinusOne));
_pcmpeq(MinusOne, MinusOne);
_psub(Dest, MinusOne); _psub(Dest, MinusOne);
return Dest; return Dest;
} }
// Materializes a vector register in which each element has only its
// high order (sign) bit set.
Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, int32_t RegNum) {
  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
         Ty == IceType_v16i8);
  if (Ty == IceType_v16i8) {
    // SSE has no left shift operation for vectors of 8 bit integers, so
    // instead broadcast a 32 bit constant whose every byte has only the
    // top bit set.
    const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
    Constant *ConstantMask =
        Ctx->getConstantInt(IceType_i32, HIGH_ORDER_BITS_MASK);
    Variable *Reg = makeReg(Ty, RegNum);
    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
    return Reg;
  }
  // For 16 and 32 bit elements, shift a vector of all-ones lanes left
  // until only the sign bit of each element remains.
  Variable *Reg = makeVectorOfOnes(Ty, RegNum);
  SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
  _psll(Reg, Ctx->getConstantInt(IceType_i8, Shift));
  return Reg;
}
OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty, OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
Variable *Slot, Variable *Slot,
uint32_t Offset) { uint32_t Offset) {
......
...@@ -153,6 +153,10 @@ protected: ...@@ -153,6 +153,10 @@ protected:
// Returns a vector in a register with the given constant entries. // Returns a vector in a register with the given constant entries.
Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister); Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister); Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);
Variable *makeVectorOfMinusOnes(Type Ty,
int32_t RegNum = Variable::NoRegister);
Variable *makeVectorOfHighOrderBits(Type Ty,
int32_t RegNum = Variable::NoRegister);
// Return a memory operand corresponding to a stack allocated Variable. // Return a memory operand corresponding to a stack allocated Variable.
OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot, OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment