Commit 9a0168a9 by Matt Wala

Lower icmp operations between vector values.

SSE2 only has signed integer comparison. Unsigned compares are implemented by inverting the sign bits of the operands and doing a signed compare. A common pattern in clang generated IR is a vector compare which generates an i1 vector followed by a sign extension of the result of the compare. The x86 comparison instructions already generate sign extended values, so we can eliminate unnecessary sext operations that follow compares in the IR. BUG=none R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/412593002
parent 87543355
...@@ -67,7 +67,7 @@ for optlevel in ${OPTLEVELS} ; do ...@@ -67,7 +67,7 @@ for optlevel in ${OPTLEVELS} ; do
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \ --dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_icmp.cpp \ --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
--driver=test_icmp_main.cpp \ --driver=test_icmp_main.cpp \
--output=test_icmp_O${optlevel} --output=test_icmp_O${optlevel}
......
// This aims to test the icmp bitcode instruction across all PNaCl //===- subzero/crosstest/test_icmp.cpp - Implementation for tests ---------===//
// primitive integer types. //
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This aims to test the icmp bitcode instruction across all PNaCl primitive
// and SIMD integer types.
//
//===----------------------------------------------------------------------===//
#include <stdint.h> #include <stdint.h>
...@@ -9,13 +20,20 @@ ...@@ -9,13 +20,20 @@
bool icmp##cmp(uint8_t a, uint8_t b) { return a op b; } \ bool icmp##cmp(uint8_t a, uint8_t b) { return a op b; } \
bool icmp##cmp(uint16_t a, uint16_t b) { return a op b; } \ bool icmp##cmp(uint16_t a, uint16_t b) { return a op b; } \
bool icmp##cmp(uint32_t a, uint32_t b) { return a op b; } \ bool icmp##cmp(uint32_t a, uint32_t b) { return a op b; } \
bool icmp##cmp(uint64_t a, uint64_t b) { return a op b; } bool icmp##cmp(uint64_t a, uint64_t b) { return a op b; } \
v4ui32 icmp##cmp(v4ui32 a, v4ui32 b) { return a op b; } \
v8ui16 icmp##cmp(v8ui16 a, v8ui16 b) { return a op b; } \
v16ui8 icmp##cmp(v16ui8 a, v16ui8 b) { return a op b; }
ICMP_U_TABLE ICMP_U_TABLE
#undef X #undef X
#define X(cmp, op) \ #define X(cmp, op) \
bool icmp##cmp(int8_t a, int8_t b) { return a op b; } \ bool icmp##cmp(int8_t a, int8_t b) { return a op b; } \
bool icmp##cmp(int16_t a, int16_t b) { return a op b; } \ bool icmp##cmp(int16_t a, int16_t b) { return a op b; } \
bool icmp##cmp(int32_t a, int32_t b) { return a op b; } \ bool icmp##cmp(int32_t a, int32_t b) { return a op b; } \
bool icmp##cmp(int64_t a, int64_t b) { return a op b; } bool icmp##cmp(int64_t a, int64_t b) { return a op b; } \
v4si32 icmp##cmp(v4si32 a, v4si32 b) { return a op b; } \
v8si16 icmp##cmp(v8si16 a, v8si16 b) { return a op b; } \
v16si8 icmp##cmp(v16si8 a, v16si8 b) { return a op b; }
ICMP_S_TABLE ICMP_S_TABLE
#undef X #undef X
//===- subzero/crosstest/test_icmp.h - Test prototypes -------*- C++ -*----===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the function prototypes for crosstesting the icmp
// bitcode instruction.
//
//===----------------------------------------------------------------------===//
#include "test_icmp.def" #include "test_icmp.def"
#include "vectors.h"
#define X(cmp, op) \ #define X(cmp, op) \
bool icmp##cmp(uint8_t a, uint8_t b); \ bool icmp##cmp(uint8_t a, uint8_t b); \
bool icmp##cmp(uint16_t a, uint16_t b); \ bool icmp##cmp(uint16_t a, uint16_t b); \
bool icmp##cmp(uint32_t a, uint32_t b); \ bool icmp##cmp(uint32_t a, uint32_t b); \
bool icmp##cmp(uint64_t a, uint64_t b); bool icmp##cmp(uint64_t a, uint64_t b); \
v4ui32 icmp##cmp(v4ui32 a, v4ui32 b); \
v8ui16 icmp##cmp(v8ui16 a, v8ui16 b); \
v16ui8 icmp##cmp(v16ui8 a, v16ui8 b);
ICMP_U_TABLE ICMP_U_TABLE
#undef X #undef X
...@@ -12,6 +31,17 @@ ICMP_U_TABLE ...@@ -12,6 +31,17 @@ ICMP_U_TABLE
bool icmp##cmp(int8_t a, int8_t b); \ bool icmp##cmp(int8_t a, int8_t b); \
bool icmp##cmp(int16_t a, int16_t b); \ bool icmp##cmp(int16_t a, int16_t b); \
bool icmp##cmp(int32_t a, int32_t b); \ bool icmp##cmp(int32_t a, int32_t b); \
bool icmp##cmp(int64_t a, int64_t b); bool icmp##cmp(int64_t a, int64_t b); \
v4si32 icmp##cmp(v4si32 a, v4si32 b); \
v8si16 icmp##cmp(v8si16 a, v8si16 b); \
v16si8 icmp##cmp(v16si8 a, v16si8 b);
ICMP_S_TABLE
#undef X
#define X(cmp, op) \
v4si32 icmpi1##cmp(v4si32 a, v4si32 b); \
v8si16 icmpi1##cmp(v8si16 a, v8si16 b); \
v16si8 icmpi1##cmp(v16si8 a, v16si8 b);
ICMP_U_TABLE
ICMP_S_TABLE ICMP_S_TABLE
#undef X #undef X
; Crosstest inputs for icmp on vectors of i1.  Each function is written
; in the sext(icmp(trunc %a, trunc %b)) form that clang emits for C/C++
; vector comparisons: the arguments are truncated to <N x i1>, compared
; with one predicate, and the i1 result is sign extended back to the
; original element width.  Function names are the Itanium manglings the
; crosstest driver expects (e.g. icmpi1Eq(v16si8, v16si8)).
target triple = "i686-pc-linux-gnu"
; ---- <16 x i8> operand versions (compare as <16 x i1>) ----
define <16 x i8> @_Z8icmpi1EqDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp eq <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z8icmpi1NeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp ne <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1UgtDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp ugt <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1UgeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp uge <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1UltDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp ult <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1UleDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp ule <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1SgtDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp sgt <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1SgeDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp sge <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1SltDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp slt <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
define <16 x i8> @_Z9icmpi1SleDv16_aS_(<16 x i8> %a, <16 x i8> %b) {
entry:
%a.trunc = trunc <16 x i8> %a to <16 x i1>
%b.trunc = trunc <16 x i8> %b to <16 x i1>
%cmp = icmp sle <16 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %cmp.sext
}
; ---- <8 x i16> operand versions (compare as <8 x i1>) ----
define <8 x i16> @_Z8icmpi1EqDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp eq <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z8icmpi1NeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp ne <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1UgtDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp ugt <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1UgeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp uge <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1UltDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp ult <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1UleDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp ule <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1SgtDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp sgt <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1SgeDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp sge <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1SltDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp slt <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
define <8 x i16> @_Z9icmpi1SleDv8_sS_(<8 x i16> %a, <8 x i16> %b) {
entry:
%a.trunc = trunc <8 x i16> %a to <8 x i1>
%b.trunc = trunc <8 x i16> %b to <8 x i1>
%cmp = icmp sle <8 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %cmp.sext
}
; ---- <4 x i32> operand versions (compare as <4 x i1>) ----
define <4 x i32> @_Z8icmpi1EqDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp eq <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z8icmpi1NeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp ne <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1UgtDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp ugt <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1UgeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp uge <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1UltDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp ult <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1UleDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp ule <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1SgtDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp sgt <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1SgeDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp sge <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1SltDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp slt <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
define <4 x i32> @_Z9icmpi1SleDv4_iS_(<4 x i32> %a, <4 x i32> %b) {
entry:
%a.trunc = trunc <4 x i32> %a to <4 x i1>
%b.trunc = trunc <4 x i32> %b to <4 x i1>
%cmp = icmp sle <4 x i1> %a.trunc, %b.trunc
%cmp.sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %cmp.sext
}
/* crosstest.py --test=test_icmp.cpp --driver=test_icmp_main.cpp \ //===- subzero/crosstest/test_icmp_main.cpp - Driver for tests. -----------===//
--prefix=Subzero_ --output=test_icmp */ //
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Driver for cross testing the icmp bitcode instruction
//
//===----------------------------------------------------------------------===//
/* crosstest.py --test=test_icmp.cpp --test=test_icmp_i1vec.ll \
--driver=test_icmp_main.cpp --prefix=Subzero_ --output=test_icmp */
#include <climits> // CHAR_BIT
#include <cstring> // memcmp, memset
#include <stdint.h> #include <stdint.h>
#include <iostream> #include <iostream>
...@@ -63,8 +78,9 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -63,8 +78,9 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
++Passes; ++Passes;
} else { } else {
++Failures; ++Failures;
std::cout << "icmp" << Funcs[f].Name << (8 * sizeof(TypeUnsigned)) std::cout << "icmp" << Funcs[f].Name
<< "(" << Value1 << ", " << Value2 << "): sz=" << ResultSz << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
<< ", " << Value2 << "): sz=" << ResultSz
<< " llc=" << ResultLlc << std::endl; << " llc=" << ResultLlc << std::endl;
} }
} }
...@@ -90,8 +106,8 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -90,8 +106,8 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} else { } else {
++Failures; ++Failures;
std::cout << "icmp" << Funcs[f].Name std::cout << "icmp" << Funcs[f].Name
<< (8 * sizeof(TypeUnsigned)) << "(" << Value1 << ", " << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
<< Value2 << "): sz=" << ResultSz << ", " << Value2 << "): sz=" << ResultSz
<< " llc=" << ResultLlc << std::endl; << " llc=" << ResultLlc << std::endl;
} }
} }
...@@ -102,6 +118,155 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -102,6 +118,155 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
} }
// Upper bound on the number of pseudorandom input pairs tried per
// comparison function.
const static size_t MaxTestsPerFunc = 100000;

// Crosstests the icmp instruction on vectors of integers.  For every
// comparison in ICMP_U_TABLE and ICMP_S_TABLE, pseudorandom vectors are
// fed to both the llc-compiled reference (icmp##cmp) and the
// Subzero-compiled version (Subzero_::icmp##cmp), and the two results
// are compared bytewise.  Counters are updated in place; mismatches are
// reported on std::cout.
template <typename TypeUnsignedLabel, typename TypeSignedLabel>
void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned;
  typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned;
  typedef TypeUnsigned (*FuncTypeUnsigned)(TypeUnsigned, TypeUnsigned);
  typedef TypeSigned (*FuncTypeSigned)(TypeSigned, TypeSigned);
  static struct {
    const char *Name;
    FuncTypeUnsigned FuncLlc;
    FuncTypeUnsigned FuncSz;
  } Funcs[] = {
// Fix: stringify the macro argument (the comparison name) rather than
// the literal token "inst", so failure messages identify which
// comparison failed instead of always printing "inst".
#define X(cmp, op)                                                             \
  {                                                                            \
    STR(cmp), (FuncTypeUnsigned)icmp##cmp,                                     \
    (FuncTypeUnsigned)Subzero_::icmp##cmp                                      \
  }                                                                            \
  ,
      ICMP_U_TABLE
#undef X
// The signed table entries are cast through the signed function type so
// that both flavors can share one unsigned-typed table slot.
#define X(cmp, op)                                                             \
  {                                                                            \
    STR(cmp), (FuncTypeUnsigned)(FuncTypeSigned)icmp##cmp,                     \
    (FuncTypeUnsigned)(FuncTypeSigned)Subzero_::icmp##cmp                      \
  }                                                                            \
  ,
      ICMP_S_TABLE
#undef X
  };
  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
  const static size_t NumElementsInType = Vectors<TypeUnsigned>::NumElements;
  for (size_t f = 0; f < NumFuncs; ++f) {
    PRNG Index;
    for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
      // Initialize the test vectors with pseudorandomly chosen values.
      TypeUnsigned Value1, Value2;
      for (size_t j = 0; j < NumElementsInType; ++j) {
        Value1[j] = Values[Index() % NumValues];
        Value2[j] = Values[Index() % NumValues];
      }
      // Perform the test.
      TypeUnsigned ResultSz = Funcs[f].FuncSz(Value1, Value2);
      TypeUnsigned ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
      ++TotalTests;
      if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
        ++Passes;
      } else {
        ++Failures;
        std::cout << "test" << Funcs[f].Name
                  << Vectors<TypeUnsignedLabel>::TypeName << "("
                  << vectAsString<TypeUnsignedLabel>(Value1) << ","
                  << vectAsString<TypeUnsignedLabel>(Value2)
                  << "): sz=" << vectAsString<TypeUnsignedLabel>(ResultSz)
                  << " llc=" << vectAsString<TypeUnsignedLabel>(ResultLlc)
                  << std::endl;
      }
    }
  }
}
// Binary-increments Vect, viewed as a little-endian string of 0/1
// elements (element 0 is the least significant digit).  Returns true
// exactly when the increment wraps the vector back to all zeros.
template <typename T> bool incrementI1Vector(typename Vectors<T>::Ty &Vect) {
  const static size_t NumElements = Vectors<T>::NumElements;
  size_t Pos = 0;
  while (Pos < NumElements) {
    if (Vect[Pos] == 0) {
      // Found a clear digit: set it and stop carrying.
      Vect[Pos] = 1;
      return false;
    }
    // Digit was set: clear it and carry into the next position.
    Vect[Pos] = 0;
    ++Pos;
  }
  // Every digit carried out, so the vector wrapped around.
  return true;
}
// Crosstests the icmp instruction on vectors of i1.  When the vector is
// short enough, every possible pair of 0/1 input vectors is tried
// exhaustively; otherwise pseudorandom 0/1 vectors are used.  For each
// comparison the llc-compiled reference (icmpi1##cmp) and the
// Subzero-compiled version (Subzero_::icmpi1##cmp) are run and their
// results compared bytewise.  Counters are updated in place; mismatches
// are reported on std::cout.
template <typename T>
void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  typedef typename Vectors<T>::Ty Ty;
  typedef Ty (*FuncType)(Ty, Ty);
  static struct {
    const char *Name;
    FuncType FuncLlc;
    FuncType FuncSz;
  } Funcs[] = {
// Fix: stringify the macro argument (the comparison name) rather than
// the literal token "inst", so failure messages identify which
// comparison failed instead of always printing "inst".
#define X(cmp, op)                                                             \
  { STR(cmp), (FuncType)icmpi1##cmp, (FuncType)Subzero_::icmpi1##cmp }         \
  ,
      ICMP_U_TABLE
      ICMP_S_TABLE
#undef X
  };
  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
  const static size_t NumElements = Vectors<T>::NumElements;
  const static size_t MAX_NUMBER_OF_ELEMENTS_FOR_EXHAUSTIVE_TESTING = 8;
  // Check if the type is small enough to try all possible input pairs.
  if (NumElements <= MAX_NUMBER_OF_ELEMENTS_FOR_EXHAUSTIVE_TESTING) {
    for (size_t f = 0; f < NumFuncs; ++f) {
      // Enumerate all 2^NumElements values for each operand via
      // incrementI1Vector, which reports the wraparound back to zero.
      Ty Value1, Value2;
      memset(&Value1, 0, sizeof(Value1));
      for (bool IsValue1Done = false; !IsValue1Done;
           IsValue1Done = incrementI1Vector<T>(Value1)) {
        memset(&Value2, 0, sizeof(Value2));
        for (bool IsValue2Done = false; !IsValue2Done;
             IsValue2Done = incrementI1Vector<T>(Value2)) {
          Ty ResultSz = Funcs[f].FuncSz(Value1, Value2);
          Ty ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
          ++TotalTests;
          if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
            ++Passes;
          } else {
            ++Failures;
            std::cout << "test" << Funcs[f].Name << Vectors<T>::TypeName << "("
                      << vectAsString<T>(Value1) << ","
                      << vectAsString<T>(Value2)
                      << "): sz=" << vectAsString<T>(ResultSz)
                      << " llc=" << vectAsString<T>(ResultLlc) << std::endl;
          }
        }
      }
    }
  } else {
    for (size_t f = 0; f < NumFuncs; ++f) {
      PRNG Index;
      for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
        Ty Value1, Value2;
        // Initialize the test vectors with pseudorandom 0/1 elements.
        for (size_t j = 0; j < NumElements; ++j) {
          Value1[j] = Index() % 2;
          Value2[j] = Index() % 2;
        }
        // Perform the test.
        Ty ResultSz = Funcs[f].FuncSz(Value1, Value2);
        Ty ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
        ++TotalTests;
        if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
          ++Passes;
        } else {
          ++Failures;
          std::cout << "test" << Funcs[f].Name << Vectors<T>::TypeName << "("
                    << vectAsString<T>(Value1) << "," << vectAsString<T>(Value2)
                    << "): sz=" << vectAsString<T>(ResultSz)
                    << " llc=" << vectAsString<T>(ResultLlc) << std::endl;
        }
      }
    }
  }
}
int main(int argc, char **argv) { int main(int argc, char **argv) {
size_t TotalTests = 0; size_t TotalTests = 0;
size_t Passes = 0; size_t Passes = 0;
...@@ -111,6 +276,12 @@ int main(int argc, char **argv) { ...@@ -111,6 +276,12 @@ int main(int argc, char **argv) {
testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures); testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures); testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures); testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures);
testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures);
testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures);
testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures);
testsVecI1<v4i1>(TotalTests, Passes, Failures);
testsVecI1<v8i1>(TotalTests, Passes, Failures);
testsVecI1<v16i1>(TotalTests, Passes, Failures);
std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
<< " Failures=" << Failures << "\n"; << " Failures=" << Failures << "\n";
......
...@@ -2261,6 +2261,124 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { ...@@ -2261,6 +2261,124 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
Operand *Src1 = legalize(Inst->getSrc(1)); Operand *Src1 = legalize(Inst->getSrc(1));
Variable *Dest = Inst->getDest(); Variable *Dest = Inst->getDest();
if (isVectorType(Dest->getType())) {
Type Ty = Src0->getType();
// Promote i1 vectors to 128 bit integer vector types.
if (typeElementType(Ty) == IceType_i1) {
Type NewTy = IceType_NUM;
switch (Ty) {
default:
llvm_unreachable("unexpected type");
break;
case IceType_v4i1:
NewTy = IceType_v4i32;
break;
case IceType_v8i1:
NewTy = IceType_v8i16;
break;
case IceType_v16i1:
NewTy = IceType_v16i8;
break;
}
Variable *NewSrc0 = Func->makeVariable(NewTy, Context.getNode());
Variable *NewSrc1 = Func->makeVariable(NewTy, Context.getNode());
lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
Src0 = NewSrc0;
Src1 = NewSrc1;
Ty = NewTy;
}
InstIcmp::ICond Condition = Inst->getCondition();
// SSE2 only has signed comparison operations. Transform unsigned
// inputs in a manner that allows for the use of signed comparison
// operations by flipping the high order bits.
if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
Variable *T0 = makeReg(Ty);
Variable *T1 = makeReg(Ty);
Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
_movp(T0, Src0);
_pxor(T0, HighOrderBits);
_movp(T1, Src1);
_pxor(T1, HighOrderBits);
Src0 = T0;
Src1 = T1;
}
// TODO: ALIGNHACK: Both operands to compare instructions need to be
// in registers until stack alignment support is implemented. Once
// there is support for stack alignment, LEGAL_HACK can be removed.
#define LEGAL_HACK(Vect) legalizeToVar((Vect))
Variable *T = makeReg(Ty);
switch (Condition) {
default:
llvm_unreachable("unexpected condition");
break;
case InstIcmp::Eq: {
_movp(T, Src0);
_pcmpeq(T, LEGAL_HACK(Src1));
} break;
case InstIcmp::Ne: {
_movp(T, Src0);
_pcmpeq(T, LEGAL_HACK(Src1));
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
case InstIcmp::Ugt:
case InstIcmp::Sgt: {
_movp(T, Src0);
_pcmpgt(T, LEGAL_HACK(Src1));
} break;
case InstIcmp::Uge:
case InstIcmp::Sge: {
// !(Src1 > Src0)
_movp(T, Src1);
_pcmpgt(T, LEGAL_HACK(Src0));
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
case InstIcmp::Ult:
case InstIcmp::Slt: {
_movp(T, Src1);
_pcmpgt(T, LEGAL_HACK(Src0));
} break;
case InstIcmp::Ule:
case InstIcmp::Sle: {
// !(Src0 > Src1)
_movp(T, Src0);
_pcmpgt(T, LEGAL_HACK(Src1));
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
}
#undef LEGAL_HACK
_movp(Dest, T);
// The following pattern occurs often in lowered C and C++ code:
//
// %cmp = icmp pred <n x ty> %src0, %src1
// %cmp.ext = sext <n x i1> %cmp to <n x ty>
//
// We can avoid the sext operation by copying the result from pcmpgt
// and pcmpeq, which is already sign extended, to the result of the
// sext operation
if (InstCast *NextCast =
llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
if (NextCast->getCastKind() == InstCast::Sext &&
NextCast->getSrc(0) == Dest) {
_movp(NextCast->getDest(), T);
// Skip over the instruction.
NextCast->setDeleted();
Context.advanceNext();
}
}
return;
}
// If Src1 is an immediate, or known to be a physical register, we can // If Src1 is an immediate, or known to be a physical register, we can
// allow Src0 to be a memory operand. Otherwise, Src0 must be copied into // allow Src0 to be a memory operand. Otherwise, Src0 must be copied into
// a physical register. (Actually, either Src0 or Src1 can be chosen for // a physical register. (Actually, either Src0 or Src1 can be chosen for
...@@ -3398,9 +3516,14 @@ void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) { ...@@ -3398,9 +3516,14 @@ void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) {
lowerCall(Call); lowerCall(Call);
} }
// There is no support for loading or emitting vector constants, so the
// vector values returned from makeVectorOfZeros, makeVectorOfOnes,
// etc. are initialized with register operations.
//
// TODO(wala): Add limited support for vector constants so that
// complex initialization in registers is unnecessary.
Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) { Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
// There is no support for loading or emitting vector constants, so
// this value is initialized using register operations.
Variable *Reg = makeReg(Ty, RegNum); Variable *Reg = makeReg(Ty, RegNum);
// Insert a FakeDef, since otherwise the live range of Reg might // Insert a FakeDef, since otherwise the live range of Reg might
// be overestimated. // be overestimated.
...@@ -3409,18 +3532,41 @@ Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) { ...@@ -3409,18 +3532,41 @@ Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
return Reg; return Reg;
} }
// Materializes a vector register with every lane equal to -1 (all bits
// set).  pcmpeq of a register against itself always compares equal, so
// this needs no constant load from memory.
Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, int32_t RegNum) {
  Variable *Reg = makeReg(Ty, RegNum);
  // A FakeDef keeps the live range of Reg from being overestimated.
  Context.insert(InstFakeDef::create(Func, Reg));
  _pcmpeq(Reg, Reg);
  return Reg;
}
Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) { Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) {
// There is no support for loading or emitting vector constants, so
// this value is initialized using register operations.
Variable *Dest = makeVectorOfZeros(Ty, RegNum); Variable *Dest = makeVectorOfZeros(Ty, RegNum);
Variable *MinusOne = makeReg(Ty); Variable *MinusOne = makeVectorOfMinusOnes(Ty);
// Insert a FakeDef so the live range of MinusOne is not overestimated.
Context.insert(InstFakeDef::create(Func, MinusOne));
_pcmpeq(MinusOne, MinusOne);
_psub(Dest, MinusOne); _psub(Dest, MinusOne);
return Dest; return Dest;
} }
// Materializes a vector register in which each element has only its
// high order (sign) bit set.
Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, int32_t RegNum) {
  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
         Ty == IceType_v16i8);
  if (Ty == IceType_v16i8) {
    // SSE has no left shift operation for vectors of 8 bit integers, so
    // instead broadcast a 32 bit constant whose every byte has only the
    // top bit set.
    const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
    Constant *ConstantMask =
        Ctx->getConstantInt(IceType_i32, HIGH_ORDER_BITS_MASK);
    Variable *Reg = makeReg(Ty, RegNum);
    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
    return Reg;
  }
  // For 16 and 32 bit elements, shift a vector of all-ones lanes left
  // until only the sign bit of each element remains.
  Variable *Reg = makeVectorOfOnes(Ty, RegNum);
  SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
  _psll(Reg, Ctx->getConstantInt(IceType_i8, Shift));
  return Reg;
}
OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty, OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
Variable *Slot, Variable *Slot,
uint32_t Offset) { uint32_t Offset) {
......
...@@ -153,6 +153,10 @@ protected: ...@@ -153,6 +153,10 @@ protected:
// Returns a vector in a register with the given constant entries. // Returns a vector in a register with the given constant entries.
Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister); Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister); Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);
Variable *makeVectorOfMinusOnes(Type Ty,
int32_t RegNum = Variable::NoRegister);
Variable *makeVectorOfHighOrderBits(Type Ty,
int32_t RegNum = Variable::NoRegister);
// Return a memory operand corresponding to a stack allocated Variable. // Return a memory operand corresponding to a stack allocated Variable.
OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot, OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment