Commit ce0ca8f8 by Matt Wala

Lower the fcmp instruction for <4 x float> operands.

Most fcmp conditions map directly to single x86 instructions. For these, the lowering is table driven. BUG=none R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/413053002
parent 9cb61e2f
target triple = "i686-pc-linux-gnu" target triple = "i686-pc-linux-gnu"
; This file is extracted from fp.pnacl.ll in the lit tests, with ; This file is extracted from fp.pnacl.ll and vector-fcmp.ll in the lit
; the "internal" attribute removed from the functions. ; tests, with the "internal" attribute removed from the functions.
define i32 @fcmpFalseFloat(float %a, float %b) { define i32 @fcmpFalseFloat(float %a, float %b) {
entry: entry:
...@@ -322,3 +322,151 @@ entry: ...@@ -322,3 +322,151 @@ entry:
} }
; CHECK: fcmpTrueDouble: ; CHECK: fcmpTrueDouble:
; CHECK: mov {{.*}}, 1 ; CHECK: mov {{.*}}, 1
; fcmp false on <4 x float>: every lane is false, so the result is an
; all-zero vector (the test expects it to be produced with pxor).
define <4 x i32> @fcmpFalseVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp false <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpFalseVector:
; CHECK: pxor
}
; fcmp oeq on <4 x float>: maps directly to the cmpps "eq" predicate.
define <4 x i32> @fcmpOeqVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp oeq <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOeqVector:
; CHECK: cmpeqps
}
; fcmp oge on <4 x float>: the lowering table swaps the operands and uses
; the cmpps "le" predicate (b <= a), hence cmpleps is expected.
define <4 x i32> @fcmpOgeVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp oge <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOgeVector:
; CHECK: cmpleps
}
; fcmp ogt on <4 x float>: the lowering table swaps the operands and uses
; the cmpps "lt" predicate (b < a), hence cmpltps is expected.
define <4 x i32> @fcmpOgtVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ogt <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOgtVector:
; CHECK: cmpltps
}
; fcmp ole on <4 x float>: maps directly to the cmpps "le" predicate.
define <4 x i32> @fcmpOleVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ole <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOleVector:
; CHECK: cmpleps
}
; fcmp olt on <4 x float>: maps directly to the cmpps "lt" predicate.
define <4 x i32> @fcmpOltVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp olt <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOltVector:
; CHECK: cmpltps
}
; fcmp one on <4 x float>: no single cmpps predicate exists, so the lowering
; computes "neq" and "ord" separately and combines them with pand.
define <4 x i32> @fcmpOneVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp one <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOneVector:
; CHECK: cmpneqps
; CHECK: cmpordps
; CHECK: pand
}
; fcmp ord on <4 x float>: maps directly to the cmpps "ord" predicate.
define <4 x i32> @fcmpOrdVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ord <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOrdVector:
; CHECK: cmpordps
}
; fcmp true on <4 x float>: every lane is true, so the result is an
; all-ones vector (the test expects it to be produced with pcmpeqd).
define <4 x i32> @fcmpTrueVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp true <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpTrueVector:
; CHECK: pcmpeqd
}
; fcmp ueq on <4 x float>: no single cmpps predicate exists, so the lowering
; computes "eq" and "unord" separately and combines them with por.
define <4 x i32> @fcmpUeqVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ueq <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUeqVector:
; CHECK: cmpeqps
; CHECK: cmpunordps
; CHECK: por
}
; fcmp uge on <4 x float>: equivalent to "not (a < b)", so it maps to the
; cmpps "nlt" predicate without swapping operands.
define <4 x i32> @fcmpUgeVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp uge <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUgeVector:
; CHECK: cmpnltps
}
; fcmp ugt on <4 x float>: equivalent to "not (a <= b)", so it maps to the
; cmpps "nle" predicate without swapping operands.
define <4 x i32> @fcmpUgtVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ugt <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUgtVector:
; CHECK: cmpnleps
}
; fcmp ule on <4 x float>: the lowering table swaps the operands and uses
; the cmpps "nlt" predicate (not b < a), hence cmpnltps is expected.
define <4 x i32> @fcmpUleVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ule <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUleVector:
; CHECK: cmpnltps
}
; fcmp ult on <4 x float>: the lowering table swaps the operands and uses
; the cmpps "nle" predicate (not b <= a), hence cmpnleps is expected.
define <4 x i32> @fcmpUltVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ult <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUltVector:
; CHECK: cmpnleps
}
; fcmp une on <4 x float>: maps directly to the cmpps "neq" predicate.
define <4 x i32> @fcmpUneVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp une <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUneVector:
; CHECK: cmpneqps
}
; fcmp uno on <4 x float>: maps directly to the cmpps "unord" predicate.
define <4 x i32> @fcmpUnoVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp uno <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUnoVector:
; CHECK: cmpunordps
}
//===- subzero/crosstest/test_fcmp_main.cpp - Driver for tests ------------===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Driver for cross testing the fcmp bitcode instruction
//
//===----------------------------------------------------------------------===//
/* crosstest.py --test=test_fcmp.pnacl.ll --driver=test_fcmp_main.cpp \ /* crosstest.py --test=test_fcmp.pnacl.ll --driver=test_fcmp_main.cpp \
--prefix=Subzero_ --output=test_fcmp */ --prefix=Subzero_ --output=test_fcmp */
#include <cassert> #include <cassert>
#include <cfloat> #include <cfloat>
#include <cmath> #include <cmath>
#include <cstring>
#include <iostream> #include <iostream>
#include "vectors.h"
#include "test_fcmp.def" #include "test_fcmp.def"
#define X(cmp) \ #define X(cmp) \
extern "C" bool fcmp##cmp##Float(float a, float b); \ extern "C" bool fcmp##cmp##Float(float a, float b); \
extern "C" bool fcmp##cmp##Double(double a, double b); \ extern "C" bool fcmp##cmp##Double(double a, double b); \
extern "C" v4si32 fcmp##cmp##Vector(v4f32 a, v4f32 b); \
extern "C" bool Subzero_fcmp##cmp##Float(float a, float b); \ extern "C" bool Subzero_fcmp##cmp##Float(float a, float b); \
extern "C" bool Subzero_fcmp##cmp##Double(double a, double b); extern "C" bool Subzero_fcmp##cmp##Double(double a, double b); \
extern "C" v4si32 Subzero_fcmp##cmp##Vector(v4f32 a, v4f32 b);
FCMP_TABLE; FCMP_TABLE;
#undef X #undef X
int main(int argc, char **argv) { volatile double *Values;
size_t NumValues;
void initializeValues() {
static const double NegInf = -1.0 / 0.0; static const double NegInf = -1.0 / 0.0;
static const double Zero = 0.0; static const double Zero = 0.0;
static const double Ten = 10.0; static const double Ten = 10.0;
...@@ -30,12 +50,14 @@ int main(int argc, char **argv) { ...@@ -30,12 +50,14 @@ int main(int argc, char **argv) {
assert(NegInf < Zero); assert(NegInf < Zero);
assert(NegInf < PosInf); assert(NegInf < PosInf);
assert(Zero < PosInf); assert(Zero < PosInf);
static volatile double InitValues[] = {NegInf, -Zero, Zero, DBL_MIN,
FLT_MIN, Ten, FLT_MAX, DBL_MAX,
PosInf, Nan, NegNan};
NumValues = sizeof(InitValues) / sizeof(*InitValues);
Values = InitValues;
}
volatile double Values[] = { NegInf, -Zero, Zero, DBL_MIN, FLT_MIN, void testsScalar(size_t &TotalTests, size_t &Passes, size_t &Failures) {
Ten, FLT_MAX, DBL_MAX, PosInf, Nan,
NegNan };
const static size_t NumValues = sizeof(Values) / sizeof(*Values);
typedef bool (*FuncTypeFloat)(float, float); typedef bool (*FuncTypeFloat)(float, float);
typedef bool (*FuncTypeDouble)(double, double); typedef bool (*FuncTypeDouble)(double, double);
static struct { static struct {
...@@ -58,9 +80,7 @@ int main(int argc, char **argv) { ...@@ -58,9 +80,7 @@ int main(int argc, char **argv) {
bool ResultSz, ResultLlc; bool ResultSz, ResultLlc;
size_t TotalTests = 0; assert(Values && NumValues);
size_t Passes = 0;
size_t Failures = 0;
for (size_t f = 0; f < NumFuncs; ++f) { for (size_t f = 0; f < NumFuncs; ++f) {
for (size_t i = 0; i < NumValues; ++i) { for (size_t i = 0; i < NumValues; ++i) {
...@@ -76,7 +96,7 @@ int main(int argc, char **argv) { ...@@ -76,7 +96,7 @@ int main(int argc, char **argv) {
++Failures; ++Failures;
std::cout << Funcs[f].Name << "Float(" << Value1Float << ", " std::cout << Funcs[f].Name << "Float(" << Value1Float << ", "
<< Value2Float << "): sz=" << ResultSz << Value2Float << "): sz=" << ResultSz
<< " llc=" << ResultLlc << std::endl; << " llc=" << ResultLlc << "\n";
} }
++TotalTests; ++TotalTests;
double Value1Double = Values[i]; double Value1Double = Values[i];
...@@ -89,11 +109,66 @@ int main(int argc, char **argv) { ...@@ -89,11 +109,66 @@ int main(int argc, char **argv) {
++Failures; ++Failures;
std::cout << Funcs[f].Name << "Double(" << Value1Double << ", " std::cout << Funcs[f].Name << "Double(" << Value1Double << ", "
<< Value2Double << "): sz=" << ResultSz << Value2Double << "): sz=" << ResultSz
<< " llc=" << ResultLlc << std::endl; << " llc=" << ResultLlc << "\n";
}
}
}
}
}
// Cross-tests every vector fcmp variant listed in FCMP_TABLE: each function
// is called through both its Subzero_-prefixed build and its unprefixed
// build on the same operands, and the two results are compared bytewise.
// TotalTests, Passes and Failures are incremented in place; mismatches are
// printed to std::cout.
void testsVector(size_t &TotalTests, size_t &Passes, size_t &Failures) {
typedef v4si32 (*FuncTypeVector)(v4f32, v4f32);
// One row per fcmp condition: printable name plus the two implementations.
static struct {
const char *Name;
FuncTypeVector FuncVectorSz;
FuncTypeVector FuncVectorLlc;
} Funcs[] = {
#define X(cmp) \
{ "fcmp" STR(cmp), Subzero_fcmp##cmp##Vector, fcmp##cmp##Vector } \
,
FCMP_TABLE
#undef X
};
const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
const static size_t NumElementsInType = 4;
const static size_t MaxTestsPerFunc = 100000;
// The global Values/NumValues must have been set up (see initializeValues).
assert(Values && NumValues);
for (size_t f = 0; f < NumFuncs; ++f) {
// A new generator is constructed for every function.
// NOTE(review): presumably PRNG is deterministically seeded so each
// function sees the same operand sequence -- confirm in vectors.h.
PRNG Index;
for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
v4f32 Value1, Value2;
// Fill each lane with an entry of Values picked via the generator.
for (size_t j = 0; j < NumElementsInType; ++j) {
Value1[j] = Values[Index() % NumValues];
Value2[j] = Values[Index() % NumValues];
} }
++TotalTests;
v4si32 ResultSz, ResultLlc;
ResultSz = Funcs[f].FuncVectorSz(Value1, Value2);
ResultLlc = Funcs[f].FuncVectorLlc(Value1, Value2);
// Bytewise comparison of the two result vectors.
if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
++Passes;
} else {
++Failures;
std::cout << Funcs[f].Name << "Vector(" << vectAsString<v4f32>(Value1)
<< ", " << vectAsString<v4f32>(Value2)
<< "): sz=" << vectAsString<v4si32>(ResultSz)
<< " llc=" << vectAsString<v4si32>(ResultLlc) << "\n";
} }
} }
} }
}
int main(int argc, char **argv) {
size_t TotalTests = 0;
size_t Passes = 0;
size_t Failures = 0;
initializeValues();
testsScalar(TotalTests, Passes, Failures);
testsVector(TotalTests, Passes, Failures);
std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
<< " Failures=" << Failures << "\n"; << " Failures=" << Failures << "\n";
......
...@@ -36,6 +36,18 @@ const struct InstX8632BrAttributes_ { ...@@ -36,6 +36,18 @@ const struct InstX8632BrAttributes_ {
const size_t InstX8632BrAttributesSize = const size_t InstX8632BrAttributesSize =
llvm::array_lengthof(InstX8632BrAttributes); llvm::array_lengthof(InstX8632BrAttributes);
// Attribute table for the cmpps instruction. One entry is generated per
// X(tag, emit) row of ICEINSTX8632CMPPS_TABLE, in table order; the tag is
// dropped and only the emit string is stored.
const struct InstX8632CmppsAttributes_ {
const char *EmitString; // condition text placed between "cmp" and "ps"
} InstX8632CmppsAttributes[] = {
#define X(tag, emit) \
{ emit } \
,
ICEINSTX8632CMPPS_TABLE
#undef X
};
// Number of entries in InstX8632CmppsAttributes; used to bounds-check a
// condition value before indexing the table.
const size_t InstX8632CmppsAttributesSize =
llvm::array_lengthof(InstX8632CmppsAttributes);
const struct TypeX8632Attributes_ { const struct TypeX8632Attributes_ {
const char *CvtString; // i (integer), s (single FP), d (double FP) const char *CvtString; // i (integer), s (single FP), d (double FP)
const char *SdSsString; // ss, sd, or <blank> const char *SdSsString; // ss, sd, or <blank>
...@@ -149,6 +161,13 @@ InstX8632Cmov::InstX8632Cmov(Cfg *Func, Variable *Dest, Operand *Source, ...@@ -149,6 +161,13 @@ InstX8632Cmov::InstX8632Cmov(Cfg *Func, Variable *Dest, Operand *Source,
addSource(Source); addSource(Source);
} }
// Builds a cmpps instruction with two source operands and the given
// comparison condition. Dest is registered as source 0 in addition to being
// the destination, and Source becomes source 1; the order of the two
// addSource() calls therefore fixes the operand indices.
// NOTE(review): listing Dest as a source presumably models that cmpps reads
// its destination register before overwriting it -- confirm.
InstX8632Cmpps::InstX8632Cmpps(Cfg *Func, Variable *Dest, Operand *Source,
InstX8632Cmpps::CmppsCond Condition)
: InstX8632(Func, InstX8632::Cmpps, 2, Dest), Condition(Condition) {
addSource(Dest);
addSource(Source);
}
InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr, InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
Variable *Eax, Variable *Desired, Variable *Eax, Variable *Desired,
bool Locked) bool Locked)
...@@ -695,6 +714,28 @@ void InstX8632Cmov::dump(const Cfg *Func) const { ...@@ -695,6 +714,28 @@ void InstX8632Cmov::dump(const Cfg *Func) const {
dumpSources(Func); dumpSources(Func);
} }
void InstX8632Cmpps::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2);
assert(Condition < InstX8632CmppsAttributesSize);
Str << "\t";
Str << "cmp" << InstX8632CmppsAttributes[Condition].EmitString << "ps"
<< "\t";
getDest()->emit(Func);
Str << ", ";
getSrc(1)->emit(Func);
Str << "\n";
}
void InstX8632Cmpps::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
assert(Condition < InstX8632CmppsAttributesSize);
dumpDest(Func);
Str << " = cmp" << InstX8632CmppsAttributes[Condition].EmitString << "ps"
<< "\t";
dumpSources(Func);
}
void InstX8632Cmpxchg::emit(const Cfg *Func) const { void InstX8632Cmpxchg::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit(); Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 3); assert(getSrcSize() == 3);
......
...@@ -66,6 +66,18 @@ ...@@ -66,6 +66,18 @@
X(Br_p, "p", "jp") \ X(Br_p, "p", "jp") \
//#define X(tag, dump, emit) //#define X(tag, dump, emit)
// Table of cmpps comparison conditions. Each X(tag, emit) row gives the enum
// tag for the condition and the text spliced into the mnemonic, i.e. the
// instruction is printed as "cmp" emit "ps".
// NOTE(review): the row order looks like the SSE cmpps imm8 predicate
// encoding (eq=0 ... ord=7); the visible emitter only uses the strings, so
// confirm before relying on the numeric values.
#define ICEINSTX8632CMPPS_TABLE \
/* enum value, emit */ \
X(Cmpps_eq, "eq") \
X(Cmpps_lt, "lt") \
X(Cmpps_le, "le") \
X(Cmpps_unord, "unord") \
X(Cmpps_neq, "neq") \
X(Cmpps_nlt, "nlt") \
X(Cmpps_nle, "nle") \
X(Cmpps_ord, "ord") \
//#define X(tag, emit)
#define ICETYPEX8632_TABLE \ #define ICETYPEX8632_TABLE \
/* tag, element type, cvt, sdss, pack, width */ \ /* tag, element type, cvt, sdss, pack, width */ \
X(IceType_void, IceType_void, "?" , "" , "" , "???") \ X(IceType_void, IceType_void, "?" , "" , "" , "???") \
......
...@@ -145,6 +145,7 @@ public: ...@@ -145,6 +145,7 @@ public:
Call, Call,
Cdq, Cdq,
Cmov, Cmov,
Cmpps,
Cmpxchg, Cmpxchg,
Cmpxchg8b, Cmpxchg8b,
Cvt, Cvt,
...@@ -714,6 +715,35 @@ private: ...@@ -714,6 +715,35 @@ private:
BrCond Condition; BrCond Condition;
}; };
// Cmpps instruction - compare packed single-precision floating point
// values. The kind of comparison is selected by a CmppsCond value, and
// Dest serves as both the first comparison operand and the result.
class InstX8632Cmpps : public InstX8632 {
public:
// One enumerator per row of ICEINSTX8632CMPPS_TABLE, in table order,
// followed by Cmpps_Invalid, which does not correspond to any table row.
enum CmppsCond {
#define X(tag, emit) tag,
ICEINSTX8632CMPPS_TABLE
#undef X
Cmpps_Invalid
};
// Allocates the instruction from Func's allocator and constructs it in
// place; this is the only way to create one (the constructor is private).
static InstX8632Cmpps *create(Cfg *Func, Variable *Dest, Operand *Source,
CmppsCond Condition) {
return new (Func->allocate<InstX8632Cmpps>())
InstX8632Cmpps(Func, Dest, Source, Condition);
}
virtual void emit(const Cfg *Func) const;
virtual void dump(const Cfg *Func) const;
// Supports isa/dyn_cast-style queries on the instruction kind.
static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpps); }
private:
InstX8632Cmpps(Cfg *Func, Variable *Dest, Operand *Source, CmppsCond Cond);
// Copying and assignment are disabled.
InstX8632Cmpps(const InstX8632Cmpps &) LLVM_DELETED_FUNCTION;
InstX8632Cmpps &operator=(const InstX8632Cmpps &) LLVM_DELETED_FUNCTION;
virtual ~InstX8632Cmpps() {}
CmppsCond Condition; // comparison condition selected at creation
};
// Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest> // Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest>
// equals eax. If so, the ZF is set and <desired> is stored in <dest>. // equals eax. If so, the ZF is set and <desired> is stored in <dest>.
// If not, ZF is cleared and <dest> is copied to eax (or subregister). // If not, ZF is cleared and <dest> is copied to eax (or subregister).
......
...@@ -27,26 +27,38 @@ namespace Ice { ...@@ -27,26 +27,38 @@ namespace Ice {
namespace { namespace {
// The following table summarizes the logic for lowering the fcmp instruction. // The following table summarizes the logic for lowering the fcmp
// There is one table entry for each of the 16 conditions. A comment in // instruction. There is one table entry for each of the 16 conditions.
// lowerFcmp() describes the lowering template. In the most general case, there //
// is a compare followed by two conditional branches, because some fcmp // The first four columns describe the case when the operands are
// conditions don't map to a single x86 conditional branch. However, in many // floating point scalar values. A comment in lowerFcmp() describes the
// cases it is possible to swap the operands in the comparison and have a single // lowering template. In the most general case, there is a compare
// conditional branch. Since it's quite tedious to validate the table by hand, // followed by two conditional branches, because some fcmp conditions
// good execution tests are helpful. // don't map to a single x86 conditional branch. However, in many cases
// it is possible to swap the operands in the comparison and have a
// single conditional branch. Since it's quite tedious to validate the
// table by hand, good execution tests are helpful.
//
// The last two columns describe the case when the operands are vectors
// of floating point values. For most fcmp conditions, there is a clear
// mapping to a single x86 cmpps instruction variant. Some fcmp
// conditions require special code to handle and these are marked in the
// table with a Cmpps_Invalid predicate.
const struct TableFcmp_ { const struct TableFcmp_ {
uint32_t Default; uint32_t Default;
bool SwapOperands; bool SwapScalarOperands;
InstX8632::BrCond C1, C2; InstX8632::BrCond C1, C2;
bool SwapVectorOperands;
InstX8632Cmpps::CmppsCond Predicate;
} TableFcmp[] = { } TableFcmp[] = {
#define X(val, dflt, swap, C1, C2) \ #define X(val, dflt, swapS, C1, C2, swapV, pred) \
{ dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \ { \
dflt, swapS, InstX8632Br::C1, InstX8632Br::C2, swapV, InstX8632Cmpps::pred \
} \
, ,
FCMPX8632_TABLE FCMPX8632_TABLE
#undef X #undef X
}; };
const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp); const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
// The following table summarizes the logic for lowering the icmp instruction // The following table summarizes the logic for lowering the icmp instruction
...@@ -138,7 +150,7 @@ void xMacroIntegrityCheck() { ...@@ -138,7 +150,7 @@ void xMacroIntegrityCheck() {
// Define a temporary set of enum values based on low-level // Define a temporary set of enum values based on low-level
// table entries. // table entries.
enum _tmp_enum { enum _tmp_enum {
#define X(val, dflt, swap, C1, C2) _tmp_##val, #define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
FCMPX8632_TABLE FCMPX8632_TABLE
#undef X #undef X
_num _num
...@@ -149,7 +161,7 @@ void xMacroIntegrityCheck() { ...@@ -149,7 +161,7 @@ void xMacroIntegrityCheck() {
#undef X #undef X
// Define a set of constants based on low-level table entries, // Define a set of constants based on low-level table entries,
// and ensure the table entry keys are consistent. // and ensure the table entry keys are consistent.
#define X(val, dflt, swap, C1, C2) \ #define X(val, dflt, swapS, C1, C2, swapV, pred) \
static const int _table2_##val = _tmp_##val; \ static const int _table2_##val = _tmp_##val; \
STATIC_ASSERT(_table1_##val == _table2_##val); STATIC_ASSERT(_table1_##val == _table2_##val);
FCMPX8632_TABLE; FCMPX8632_TABLE;
...@@ -2213,6 +2225,68 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) { ...@@ -2213,6 +2225,68 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
Operand *Src0 = Inst->getSrc(0); Operand *Src0 = Inst->getSrc(0);
Operand *Src1 = Inst->getSrc(1); Operand *Src1 = Inst->getSrc(1);
Variable *Dest = Inst->getDest(); Variable *Dest = Inst->getDest();
if (isVectorType(Dest->getType())) {
InstFcmp::FCond Condition = Inst->getCondition();
size_t Index = static_cast<size_t>(Condition);
assert(Index < TableFcmpSize);
if (TableFcmp[Index].SwapVectorOperands) {
Operand *T = Src0;
Src0 = Src1;
Src1 = T;
}
Variable *T = NULL;
// ALIGNHACK: Without support for stack alignment, both operands to
// cmpps need to be forced into registers. Once support for stack
// alignment is implemented, remove LEGAL_HACK.
#define LEGAL_HACK(Vect) legalizeToVar((Vect))
switch (Condition) {
default: {
InstX8632Cmpps::CmppsCond Predicate = TableFcmp[Index].Predicate;
assert(Predicate != InstX8632Cmpps::Cmpps_Invalid);
T = makeReg(Src0->getType());
_movp(T, Src0);
_cmpps(T, LEGAL_HACK(Src1), Predicate);
} break;
case InstFcmp::False:
T = makeVectorOfZeros(Src0->getType());
break;
case InstFcmp::One: {
// Check both unequal and ordered.
T = makeReg(Src0->getType());
Variable *T2 = makeReg(Src0->getType());
Src1 = LEGAL_HACK(Src1);
_movp(T, Src0);
_cmpps(T, Src1, InstX8632Cmpps::Cmpps_neq);
_movp(T2, Src0);
_cmpps(T2, Src1, InstX8632Cmpps::Cmpps_ord);
_pand(T, T2);
} break;
case InstFcmp::Ueq: {
// Check both equal or unordered.
T = makeReg(Src0->getType());
Variable *T2 = makeReg(Src0->getType());
Src1 = LEGAL_HACK(Src1);
_movp(T, Src0);
_cmpps(T, Src1, InstX8632Cmpps::Cmpps_eq);
_movp(T2, Src0);
_cmpps(T2, Src1, InstX8632Cmpps::Cmpps_unord);
_por(T, T2);
} break;
case InstFcmp::True:
T = makeVectorOfMinusOnes(IceType_v4i32);
break;
}
#undef LEGAL_HACK
_movp(Dest, T);
eliminateNextVectorSextInstruction(Dest);
return;
}
// Lowering a = fcmp cond, b, c // Lowering a = fcmp cond, b, c
// ucomiss b, c /* only if C1 != Br_None */ // ucomiss b, c /* only if C1 != Br_None */
// /* but swap b,c order if SwapOperands==true */ // /* but swap b,c order if SwapScalarOperands==true */
...@@ -2225,7 +2299,7 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) { ...@@ -2225,7 +2299,7 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
InstFcmp::FCond Condition = Inst->getCondition(); InstFcmp::FCond Condition = Inst->getCondition();
size_t Index = static_cast<size_t>(Condition); size_t Index = static_cast<size_t>(Condition);
assert(Index < TableFcmpSize); assert(Index < TableFcmpSize);
if (TableFcmp[Index].SwapOperands) { if (TableFcmp[Index].SwapScalarOperands) {
Operand *Tmp = Src0; Operand *Tmp = Src0;
Src0 = Src1; Src0 = Src1;
Src1 = Tmp; Src1 = Tmp;
...@@ -2356,26 +2430,7 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { ...@@ -2356,26 +2430,7 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
#undef LEGAL_HACK #undef LEGAL_HACK
_movp(Dest, T); _movp(Dest, T);
eliminateNextVectorSextInstruction(Dest);
// The following pattern occurs often in lowered C and C++ code:
//
// %cmp = icmp pred <n x ty> %src0, %src1
// %cmp.ext = sext <n x i1> %cmp to <n x ty>
//
// We can avoid the sext operation by copying the result from pcmpgt
// and pcmpeq, which is already sign extended, to the result of the
// sext operation
if (InstCast *NextCast =
llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
if (NextCast->getCastKind() == InstCast::Sext &&
NextCast->getSrc(0) == Dest) {
_movp(NextCast->getDest(), T);
// Skip over the instruction.
NextCast->setDeleted();
Context.advanceNext();
}
}
return; return;
} }
...@@ -3544,6 +3599,28 @@ void TargetX8632::lowerSwitch(const InstSwitch *Inst) { ...@@ -3544,6 +3599,28 @@ void TargetX8632::lowerSwitch(const InstSwitch *Inst) {
_br(Inst->getLabelDefault()); _br(Inst->getLabelDefault());
} }
// Lowered C and C++ code frequently contains this pair:
//
//   %cmp = fcmp/icmp pred <n x ty> %src0, %src1
//   %cmp.ext = sext <n x i1> %cmp to <n x ty>
//
// pcmpeqd, pcmpgtd and cmpps already produce sign extended results, so
// when the next instruction is a sext of SignExtendedResult it is replaced
// by a plain copy of that result into the sext's destination, and the sext
// itself is marked deleted and skipped.
void
TargetX8632::eliminateNextVectorSextInstruction(Variable *SignExtendedResult) {
  InstCast *Following =
      llvm::dyn_cast_or_null<InstCast>(Context.getNextInst());
  // Nothing to do unless the next instruction is a cast ...
  if (Following == NULL)
    return;
  // ... specifically a sign extension ...
  if (Following->getCastKind() != InstCast::Sext)
    return;
  // ... whose operand is the value that was just computed.
  if (Following->getSrc(0) != SignExtendedResult)
    return;
  _movp(Following->getDest(), legalizeToVar(SignExtendedResult));
  // Skip over the instruction.
  Following->setDeleted();
  Context.advanceNext();
}
void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) { void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) {
const SizeT MaxSrcs = 0; const SizeT MaxSrcs = 0;
Variable *Dest = NULL; Variable *Dest = NULL;
......
...@@ -16,24 +16,25 @@ ...@@ -16,24 +16,25 @@
#define SUBZERO_SRC_ICETARGETLOWERINGX8632_DEF #define SUBZERO_SRC_ICETARGETLOWERINGX8632_DEF
#define FCMPX8632_TABLE \ #define FCMPX8632_TABLE \
/* val, dflt, swap, C1, C2 */ \ /* <---- scalar comparison ----> <- vector comparison -> */ \
X(False, 0, 0, Br_None, Br_None) \ /* val, dflt, swap, C1, C2, swap, predicate */ \
X(Oeq, 0, 0, Br_ne, Br_p) \ X(False, 0, 0, Br_None, Br_None, 0, Cmpps_Invalid) \
X(Ogt, 1, 0, Br_a, Br_None) \ X(Oeq, 0, 0, Br_ne, Br_p, 0, Cmpps_eq) \
X(Oge, 1, 0, Br_ae, Br_None) \ X(Ogt, 1, 0, Br_a, Br_None, 1, Cmpps_lt) \
X(Olt, 1, 1, Br_a, Br_None) \ X(Oge, 1, 0, Br_ae, Br_None, 1, Cmpps_le) \
X(Ole, 1, 1, Br_ae, Br_None) \ X(Olt, 1, 1, Br_a, Br_None, 0, Cmpps_lt) \
X(One, 1, 0, Br_ne, Br_None) \ X(Ole, 1, 1, Br_ae, Br_None, 0, Cmpps_le) \
X(Ord, 1, 0, Br_np, Br_None) \ X(One, 1, 0, Br_ne, Br_None, 0, Cmpps_Invalid) \
X(Ueq, 1, 0, Br_e, Br_None) \ X(Ord, 1, 0, Br_np, Br_None, 0, Cmpps_ord) \
X(Ugt, 1, 1, Br_b, Br_None) \ X(Ueq, 1, 0, Br_e, Br_None, 0, Cmpps_Invalid) \
X(Uge, 1, 1, Br_be, Br_None) \ X(Ugt, 1, 1, Br_b, Br_None, 0, Cmpps_nle) \
X(Ult, 1, 0, Br_b, Br_None) \ X(Uge, 1, 1, Br_be, Br_None, 0, Cmpps_nlt) \
X(Ule, 1, 0, Br_be, Br_None) \ X(Ult, 1, 0, Br_b, Br_None, 1, Cmpps_nle) \
X(Une, 1, 0, Br_ne, Br_p) \ X(Ule, 1, 0, Br_be, Br_None, 1, Cmpps_nlt) \
X(Uno, 1, 0, Br_p, Br_None) \ X(Une, 1, 0, Br_ne, Br_p, 0, Cmpps_neq) \
X(True, 1, 0, Br_None, Br_None) \ X(Uno, 1, 0, Br_p, Br_None, 0, Cmpps_unord) \
//#define X(val, dflt, swap, C1, C2) X(True, 1, 0, Br_None, Br_None, 0, Cmpps_Invalid) \
//#define X(val, dflt, swapS, C1, C2, swapV, pred)
#define ICMPX8632_TABLE \ #define ICMPX8632_TABLE \
/* val, C_32, C1_64, C2_64, C3_64 */ \ /* val, C_32, C1_64, C2_64, C3_64 */ \
......
...@@ -108,6 +108,8 @@ protected: ...@@ -108,6 +108,8 @@ protected:
void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi, void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
Variable *Dest, Operand *Ptr, Operand *Val); Variable *Dest, Operand *Ptr, Operand *Val);
// If the next instruction is a sext whose operand is SignExtendedResult,
// replaces it with a vector move of SignExtendedResult and skips it.
void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
// Operand legalization helpers. To deal with address mode // Operand legalization helpers. To deal with address mode
// constraints, the helpers will create a new Operand and emit // constraints, the helpers will create a new Operand and emit
// instructions that guarantee that the Operand kind is one of those // instructions that guarantee that the Operand kind is one of those
...@@ -212,6 +214,10 @@ protected: ...@@ -212,6 +214,10 @@ protected:
void _cmp(Operand *Src0, Operand *Src1) { void _cmp(Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Icmp::create(Func, Src0, Src1)); Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
} }
// Creates a cmpps instruction comparing Dest with Src0 under Condition and
// inserts it at the current lowering position.
void _cmpps(Variable *Dest, Operand *Src0,
            InstX8632Cmpps::CmppsCond Condition) {
  InstX8632Cmpps *Compare =
      InstX8632Cmpps::create(Func, Dest, Src0, Condition);
  Context.insert(Compare);
}
void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired, void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
bool Locked) { bool Locked) {
Context.insert( Context.insert(
......
; This file checks support for comparing vector values with the fcmp
; instruction.
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
; RUN: | FileCheck --check-prefix=DUMP %s
; Check that sext elimination occurs when the result of the comparison
; instruction is already sign extended. Sign extension to 4 x i32 uses
; the pslld instruction.
; The sext that follows the compare must be folded into the compare result:
; cmpeqps is expected, and no pslld may appear after it in this function.
define <4 x i32> @sextElimination(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp oeq <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: sextElimination:
; CHECK: cmpeqps
; CHECK-NOT: pslld
}
; fcmp false on <4 x float>: every lane is false, so the result is an
; all-zero vector (the test expects it to be produced with pxor).
define <4 x i32> @fcmpFalseVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp false <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpFalseVector:
; CHECK: pxor
}
; fcmp oeq on <4 x float>: maps directly to the cmpps "eq" predicate.
define <4 x i32> @fcmpOeqVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp oeq <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOeqVector:
; CHECK: cmpeqps
}
; fcmp oge on <4 x float>: the lowering table swaps the operands and uses
; the cmpps "le" predicate (b <= a), hence cmpleps is expected.
define <4 x i32> @fcmpOgeVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp oge <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOgeVector:
; CHECK: cmpleps
}
; fcmp ogt on <4 x float>: the lowering table swaps the operands and uses
; the cmpps "lt" predicate (b < a), hence cmpltps is expected.
define <4 x i32> @fcmpOgtVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ogt <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOgtVector:
; CHECK: cmpltps
}
; fcmp ole on <4 x float>: maps directly to the cmpps "le" predicate.
define <4 x i32> @fcmpOleVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ole <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOleVector:
; CHECK: cmpleps
}
; fcmp olt on <4 x float>: maps directly to the cmpps "lt" predicate.
define <4 x i32> @fcmpOltVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp olt <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOltVector:
; CHECK: cmpltps
}
; fcmp one on <4 x float>: no single cmpps predicate exists, so the lowering
; computes "neq" and "ord" separately and combines them with pand.
define <4 x i32> @fcmpOneVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp one <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOneVector:
; CHECK: cmpneqps
; CHECK: cmpordps
; CHECK: pand
}
; fcmp ord on <4 x float>: maps directly to the cmpps "ord" predicate.
define <4 x i32> @fcmpOrdVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ord <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpOrdVector:
; CHECK: cmpordps
}
; fcmp true on <4 x float>: every lane is true, so the result is an
; all-ones vector (the test expects it to be produced with pcmpeqd).
define <4 x i32> @fcmpTrueVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp true <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpTrueVector:
; CHECK: pcmpeqd
}
; fcmp ueq on <4 x float>: no single cmpps predicate exists, so the lowering
; computes "eq" and "unord" separately and combines them with por.
define <4 x i32> @fcmpUeqVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ueq <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUeqVector:
; CHECK: cmpeqps
; CHECK: cmpunordps
; CHECK: por
}
; fcmp uge on <4 x float>: equivalent to "not (a < b)", so it maps to the
; cmpps "nlt" predicate without swapping operands.
define <4 x i32> @fcmpUgeVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp uge <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUgeVector:
; CHECK: cmpnltps
}
; fcmp ugt on <4 x float>: equivalent to "not (a <= b)", so it maps to the
; cmpps "nle" predicate without swapping operands.
define <4 x i32> @fcmpUgtVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ugt <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUgtVector:
; CHECK: cmpnleps
}
; fcmp ule on <4 x float>: the lowering table swaps the operands and uses
; the cmpps "nlt" predicate (not b < a), hence cmpnltps is expected.
define <4 x i32> @fcmpUleVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ule <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUleVector:
; CHECK: cmpnltps
}
; fcmp ult on <4 x float>: the lowering table swaps the operands and uses
; the cmpps "nle" predicate (not b <= a), hence cmpnleps is expected.
define <4 x i32> @fcmpUltVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp ult <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUltVector:
; CHECK: cmpnleps
}
; fcmp une on <4 x float>: maps directly to the cmpps "neq" predicate.
define <4 x i32> @fcmpUneVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp une <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUneVector:
; CHECK: cmpneqps
}
; fcmp uno on <4 x float>: maps directly to the cmpps "unord" predicate.
define <4 x i32> @fcmpUnoVector(<4 x float> %a, <4 x float> %b) {
entry:
%res.trunc = fcmp uno <4 x float> %a, %b
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
; CHECK-LABEL: fcmpUnoVector:
; CHECK: cmpunordps
}
; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment