Commit 9cb61e2f by Matt Wala

Lower the select instruction when the operands are of vector type.

Select of vectors is implemented by appropriately masking and combining the inputs with sign-extend and bitwise operations, without the use of branches.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/417653004
parent 656d1767
......@@ -74,6 +74,13 @@ for optlevel in ${OPTLEVELS} ; do
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_select.ll \
--driver=test_select_main.cpp \
--output=test_select_O${optlevel}
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_stacksave.c \
--driver=test_stacksave_main.c \
--output=test_stacksave_O${optlevel}
......@@ -107,6 +114,7 @@ for optlevel in ${OPTLEVELS} ; do
"${OUTDIR}"/test_fcmp_O${optlevel}
"${OUTDIR}"/test_global_O${optlevel}
"${OUTDIR}"/test_icmp_O${optlevel}
"${OUTDIR}"/test_select_O${optlevel}
"${OUTDIR}"/test_stacksave_O${optlevel}
"${OUTDIR}"/test_sync_atomic_O${optlevel}
"${OUTDIR}"/test_vector_ops_O${optlevel}
......
//===- subzero/crosstest/test_select.h - Test prototypes -----*- C++ -*----===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the function prototypes for cross testing the select
// bitcode instruction.
//
//===----------------------------------------------------------------------===//
#include "vectors.h"
// Vector select prototypes, one overload per value vector type. The condition
// is passed as a full-width integer vector; the implementations in
// test_select.ll truncate each condition lane to i1 before selecting.
// The driver includes this header twice (at global scope and again inside
// namespace Subzero_) to declare both implementations under test.
v4f32 select(v4si32 cond, v4f32 val1, v4f32 val2);
v4si32 select(v4si32 cond, v4si32 val1, v4si32 val2);
v4ui32 select(v4si32 cond, v4ui32 val1, v4ui32 val2);
v8si16 select(v8si16 cond, v8si16 val1, v8si16 val2);
v8ui16 select(v8si16 cond, v8ui16 val1, v8ui16 val2);
v16si8 select(v16si8 cond, v16si8 val1, v16si8 val2);
v16ui8 select(v16si8 cond, v16ui8 val1, v16ui8 val2);
// Select between i1 vectors: the implementations truncate all three operands
// to i1 lanes and sign-extend the selected result back to the integer vector
// type used in the signature.
v4si32 select_i1(v4si32 cond, v4si32 val1, v4si32 val2);
v8si16 select_i1(v8si16 cond, v8si16 val1, v8si16 val2);
v16si8 select_i1(v16si8 cond, v16si8 val1, v16si8 val2);
target triple = "i686-pc-linux-gnu"
; select(v4si32, v4f32, v4f32): truncates each i32 condition lane to i1 and
; picks the matching lane from %arg1 (true) or %arg2 (false).
define <4 x float> @_Z6selectDv4_iDv4_fS0_(<4 x i32> %cond.ext, <4 x float> %arg1, <4 x float> %arg2) {
entry:
%cond = trunc <4 x i32> %cond.ext to <4 x i1>
%res = select <4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2
ret <4 x float> %res
}
; select(v4si32, v4si32, v4si32): truncates each i32 condition lane to i1 and
; picks the matching lane from %arg1 (true) or %arg2 (false).
define <4 x i32> @_Z6selectDv4_iS_S_(<4 x i32> %cond.ext, <4 x i32> %arg1, <4 x i32> %arg2) {
entry:
%cond = trunc <4 x i32> %cond.ext to <4 x i1>
%res = select <4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2
ret <4 x i32> %res
}
; select(v4si32, v4ui32, v4ui32): the body is the same as the signed <4 x i32>
; overload; only the mangled symbol name differs.
define <4 x i32> @_Z6selectDv4_iDv4_jS0_(<4 x i32> %cond.ext, <4 x i32> %arg1, <4 x i32> %arg2) {
entry:
%cond = trunc <4 x i32> %cond.ext to <4 x i1>
%res = select <4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2
ret <4 x i32> %res
}
; select(v8si16, v8si16, v8si16): truncates each i16 condition lane to i1 and
; picks the matching lane from %arg1 (true) or %arg2 (false).
define <8 x i16> @_Z6selectDv8_sS_S_(<8 x i16> %cond.ext, <8 x i16> %arg1, <8 x i16> %arg2) {
entry:
%cond = trunc <8 x i16> %cond.ext to <8 x i1>
%res = select <8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2
ret <8 x i16> %res
}
; select(v8si16, v8ui16, v8ui16): the body is the same as the signed <8 x i16>
; overload; only the mangled symbol name differs.
define <8 x i16> @_Z6selectDv8_sDv8_tS0_(<8 x i16> %cond.ext, <8 x i16> %arg1, <8 x i16> %arg2) {
entry:
%cond = trunc <8 x i16> %cond.ext to <8 x i1>
%res = select <8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2
ret <8 x i16> %res
}
; select(v16si8, v16si8, v16si8): truncates each i8 condition lane to i1 and
; picks the matching lane from %arg1 (true) or %arg2 (false).
define <16 x i8> @_Z6selectDv16_aS_S_(<16 x i8> %cond.ext, <16 x i8> %arg1, <16 x i8> %arg2) {
entry:
%cond = trunc <16 x i8> %cond.ext to <16 x i1>
%res = select <16 x i1> %cond, <16 x i8> %arg1, <16 x i8> %arg2
ret <16 x i8> %res
}
; select(v16si8, v16ui8, v16ui8): the body is the same as the signed
; <16 x i8> overload; only the mangled symbol name differs.
define <16 x i8> @_Z6selectDv16_aDv16_hS0_(<16 x i8> %cond.ext, <16 x i8> %arg1, <16 x i8> %arg2) {
entry:
%cond = trunc <16 x i8> %cond.ext to <16 x i1>
%res = select <16 x i1> %cond, <16 x i8> %arg1, <16 x i8> %arg2
ret <16 x i8> %res
}
; select_i1(v4si32, v4si32, v4si32): exercises select on <4 x i1> operands.
; All three arguments are truncated to i1 lanes, and the selected i1 vector is
; sign-extended back to <4 x i32> for the return value.
define <4 x i32> @_Z9select_i1Dv4_iS_S_(<4 x i32> %cond.ext, <4 x i32> %arg1.ext, <4 x i32> %arg2.ext) {
entry:
%cond = trunc <4 x i32> %cond.ext to <4 x i1>
%arg1 = trunc <4 x i32> %arg1.ext to <4 x i1>
%arg2 = trunc <4 x i32> %arg2.ext to <4 x i1>
%res.trunc = select <4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2
%res = sext <4 x i1> %res.trunc to <4 x i32>
ret <4 x i32> %res
}
; select_i1(v8si16, v8si16, v8si16): exercises select on <8 x i1> operands.
; All three arguments are truncated to i1 lanes, and the selected i1 vector is
; sign-extended back to <8 x i16> for the return value.
define <8 x i16> @_Z9select_i1Dv8_sS_S_(<8 x i16> %cond.ext, <8 x i16> %arg1.ext, <8 x i16> %arg2.ext) {
entry:
%cond = trunc <8 x i16> %cond.ext to <8 x i1>
%arg1 = trunc <8 x i16> %arg1.ext to <8 x i1>
%arg2 = trunc <8 x i16> %arg2.ext to <8 x i1>
%res.trunc = select <8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2
%res = sext <8 x i1> %res.trunc to <8 x i16>
ret <8 x i16> %res
}
; select_i1(v16si8, v16si8, v16si8): exercises select on <16 x i1> operands.
; All three arguments are truncated to i1 lanes, and the selected i1 vector is
; sign-extended back to <16 x i8> for the return value.
define <16 x i8> @_Z9select_i1Dv16_aS_S_(<16 x i8> %cond.ext, <16 x i8> %arg1.ext, <16 x i8> %arg2.ext) {
entry:
%cond = trunc <16 x i8> %cond.ext to <16 x i1>
%arg1 = trunc <16 x i8> %arg1.ext to <16 x i1>
%arg2 = trunc <16 x i8> %arg2.ext to <16 x i1>
%res.trunc = select <16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2
%res = sext <16 x i1> %res.trunc to <16 x i8>
ret <16 x i8> %res
}
//===- subzero/crosstest/test_select_main.cpp - Driver for tests ----------===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Driver for crosstesting the select bitcode instruction
//
//===----------------------------------------------------------------------===//
/* crosstest.py --test=test_select.ll --driver=test_select_main.cpp \
--prefix=Subzero_ --output=test_select */
#include <cfloat>
#include <cstring>
#include <iostream>
#include "test_select.h"
namespace Subzero_ {
#include "test_select.h"
}
static const size_t MaxTestsPerFunc = 100000;
template <typename T, typename TI1>
void testSelect(size_t &TotalTests, size_t &Passes, size_t &Failures) {
typedef typename Vectors<T>::Ty Ty;
typedef typename Vectors<TI1>::Ty TyI1;
volatile unsigned Values[] = {
0x0, 0x1, 0x7ffffffe, 0x7fffffff,
0x80000000, 0x80000001, 0xfffffffe, 0xffffffff,
0x7e, 0x7f, 0x80, 0x81,
0xfe, 0xff, 0x100, 0x101,
0x7ffe, 0x7fff, 0x8000, 0x8001,
0xfffe, 0xffff, 0x10000, 0x10001
};
static const size_t NumValues = sizeof(Values) / sizeof(*Values);
static const size_t NumElements = Vectors<T>::NumElements;
PRNG Index;
for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
TyI1 Cond;
Ty Value1, Value2;
for (size_t j = 0; j < NumElements; ++j) {
Cond[j] = Index() % 2;
Value1[j] = Values[Index() % NumValues];
Value2[j] = Values[Index() % NumValues];
}
Ty ResultLlc = select(Cond, Value1, Value2);
Ty ResultSz = Subzero_::select(Cond, Value1, Value2);
++TotalTests;
if (!memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc))) {
++Passes;
} else {
++Failures;
std::cout << "select<" << Vectors<T>::TypeName << ">(Cond=";
std::cout << vectAsString<TI1>(Cond)
<< ", Value1=" << vectAsString<T>(Value1)
<< ", Value2=" << vectAsString<T>(Value2) << ")\n";
std::cout << "llc=" << vectAsString<T>(ResultLlc) << "\n";
std::cout << "sz =" << vectAsString<T>(ResultSz) << "\n";
}
}
}
// Specialization of testSelect for <4 x float> values. It follows the same
// procedure as the generic template but draws lane values from a table of
// floating point inputs, including infinities, NaNs, negative zero and the
// FLT_/DBL_ limits. Results are compared with memcmp, i.e. bit for bit.
template<> void
testSelect<v4f32, v4i1>(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  static const float NegInf = -1.0 / 0.0;
  static const float PosInf = 1.0 / 0.0;
  static const float Nan = 0.0 / 0.0;
  static const float NegNan = -0.0 / 0.0;
  // Integer-valued entries are converted to float when the table is built.
  volatile float Values[] = {
    0,                      1,
    0x7e,                   0x7f,
    0x80,                   0x81,
    0xfe,                   0xff,
    0x7ffe,                 0x7fff,
    0x8000,                 0x8001,
    0xfffe,                 0xffff,
    0x7ffffffe,             0x7fffffff,
    0x80000000,             0x80000001,
    0xfffffffe,             0xffffffff,
    0x100000000ll,          0x100000001ll,
    0x7ffffffffffffffell,   0x7fffffffffffffffll,
    0x8000000000000000ll,   0x8000000000000001ll,
    0xfffffffffffffffell,   0xffffffffffffffffll,
    NegInf,                 PosInf,
    Nan,                    NegNan,
    -0.0,                   FLT_MIN,
    FLT_MAX,                DBL_MIN,
    DBL_MAX
  };
  const size_t ValueCount = sizeof(Values) / sizeof(Values[0]);
  const size_t LaneCount = 4;
  PRNG Rand;
  for (size_t Iter = 0; Iter != MaxTestsPerFunc; ++Iter) {
    v4si32 Cond;
    v4f32 Value1, Value2;
    // Per lane, the generator is consumed in the order: condition, first
    // value, second value.
    for (size_t Lane = 0; Lane != LaneCount; ++Lane) {
      Cond[Lane] = Rand() % 2;
      Value1[Lane] = Values[Rand() % ValueCount];
      Value2[Lane] = Values[Rand() % ValueCount];
    }
    v4f32 ResultLlc = select(Cond, Value1, Value2);
    v4f32 ResultSz = Subzero_::select(Cond, Value1, Value2);
    ++TotalTests;
    if (memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc)) == 0) {
      ++Passes;
      continue;
    }
    // Mismatch: count it and dump the inputs together with both results.
    ++Failures;
    std::cout << "select<v4f32>(Cond="
              << vectAsString<v4i1>(Cond)
              << ", Value1=" << vectAsString<v4f32>(Value1)
              << ", Value2=" << vectAsString<v4f32>(Value2) << ")\n"
              << "llc=" << vectAsString<v4f32>(ResultLlc) << "\n"
              << "sz =" << vectAsString<v4f32>(ResultSz) << "\n";
  }
}
// Cross-tests select_i1() for the i1 vector type T. Each of the
// MaxTestsPerFunc iterations fills the condition and both value vectors with
// random 0/1 lanes, calls both the global select_i1() and
// Subzero_::select_i1(), and compares the two results byte for byte.
// TotalTests, Passes and Failures are updated in place, and every mismatch is
// reported on std::cout.
template<typename T>
void testSelectI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  typedef typename Vectors<T>::Ty Ty;
  static const size_t NumElements = Vectors<T>::NumElements;
  PRNG Index;
  for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
    Ty Cond;
    Ty Value1, Value2;
    for (size_t j = 0; j < NumElements; ++j) {
      Cond[j] = Index() % 2;
      Value1[j] = Index() % 2;
      Value2[j] = Index() % 2;
    }
    Ty ResultLlc = select_i1(Cond, Value1, Value2);
    Ty ResultSz = Subzero_::select_i1(Cond, Value1, Value2);
    ++TotalTests;
    if (!memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc))) {
      ++Passes;
    } else {
      ++Failures;
      // Name the function that was actually called (select_i1, not select) so
      // a failure report points at the right entry point.
      std::cout << "select_i1<" << Vectors<T>::TypeName << ">(Cond=";
      std::cout << vectAsString<T>(Cond)
                << ", Value1=" << vectAsString<T>(Value1)
                << ", Value2=" << vectAsString<T>(Value2) << ")\n";
      std::cout << "llc=" << vectAsString<T>(ResultLlc) << "\n";
      std::cout << "sz =" << vectAsString<T>(ResultSz) << "\n";
    }
  }
}
// Runs every select / select_i1 cross test and prints a one-line summary.
// Returns 0 when no test failed and 1 otherwise. The raw failure count is not
// used as the return value: it is a size_t narrowed to int, and a process
// exit status keeps only its low 8 bits, so a count that is a multiple of 256
// would be reported as success.
int main(int argc, char *argv[]) {
  size_t TotalTests = 0;
  size_t Passes = 0;
  size_t Failures = 0;

  testSelect<v4f32, v4i1>(TotalTests, Passes, Failures);
  testSelect<v4si32, v4i1>(TotalTests, Passes, Failures);
  testSelect<v4ui32, v4i1>(TotalTests, Passes, Failures);
  testSelect<v8si16, v8i1>(TotalTests, Passes, Failures);
  testSelect<v8ui16, v8i1>(TotalTests, Passes, Failures);
  testSelect<v16si8, v16i1>(TotalTests, Passes, Failures);
  testSelect<v16ui8, v16i1>(TotalTests, Passes, Failures);
  testSelectI1<v4i1>(TotalTests, Passes, Failures);
  testSelectI1<v8i1>(TotalTests, Passes, Failures);
  testSelectI1<v16i1>(TotalTests, Passes, Failures);

  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
            << " Failures=" << Failures << "\n";
  return Failures == 0 ? 0 : 1;
}
......@@ -376,7 +376,7 @@ InstRet::InstRet(Cfg *Func, Operand *RetValue)
InstSelect::InstSelect(Cfg *Func, Variable *Dest, Operand *Condition,
Operand *SourceTrue, Operand *SourceFalse)
: Inst(Func, Inst::Select, 3, Dest) {
assert(Condition->getType() == IceType_i1);
assert(typeElementType(Condition->getType()) == IceType_i1);
addSource(Condition);
addSource(SourceTrue);
addSource(SourceFalse);
......
......@@ -458,6 +458,7 @@ template <> const char *InstX8632Sbb::Opcode = "sbb";
template <> const char *InstX8632Psub::Opcode = "psub";
template <> const char *InstX8632And::Opcode = "and";
template <> const char *InstX8632Pand::Opcode = "pand";
template <> const char *InstX8632Pandn::Opcode = "pandn";
template <> const char *InstX8632Or::Opcode = "or";
template <> const char *InstX8632Por::Opcode = "por";
template <> const char *InstX8632Xor::Opcode = "xor";
......
......@@ -174,6 +174,7 @@ public:
Or,
Padd,
Pand,
Pandn,
Pcmpeq,
Pcmpgt,
Pextrw,
......@@ -564,6 +565,7 @@ typedef InstX8632Binop<InstX8632::Sbb> InstX8632Sbb;
typedef InstX8632Binop<InstX8632::Psub> InstX8632Psub;
typedef InstX8632Binop<InstX8632::And> InstX8632And;
typedef InstX8632Binop<InstX8632::Pand> InstX8632Pand;
typedef InstX8632Binop<InstX8632::Pandn> InstX8632Pandn;
typedef InstX8632Binop<InstX8632::Or> InstX8632Or;
typedef InstX8632Binop<InstX8632::Por> InstX8632Por;
typedef InstX8632Binop<InstX8632::Xor> InstX8632Xor;
......
......@@ -3410,11 +3410,46 @@ void TargetX8632::lowerRet(const InstRet *Inst) {
}
void TargetX8632::lowerSelect(const InstSelect *Inst) {
// a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1:
Variable *Dest = Inst->getDest();
Operand *SrcT = Inst->getTrueOperand();
Operand *SrcF = Inst->getFalseOperand();
Operand *Condition = legalize(Inst->getCondition());
Operand *Condition = Inst->getCondition();
if (isVectorType(Dest->getType())) {
// a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)
// TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has
// blendps and pblendw for constant condition operands.
Type SrcTy = SrcT->getType();
Variable *T = makeReg(SrcTy);
Variable *T2 = makeReg(SrcTy);
// Sign extend the condition operand if applicable.
if (SrcTy == IceType_v4f32) {
// The sext operation takes only integer arguments.
Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode());
lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
_movp(T, T3);
} else if (typeElementType(SrcTy) != IceType_i1) {
lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
} else {
_movp(T, Condition);
}
// ALIGNHACK: Until stack alignment support is implemented, the
// bitwise vector instructions need to have both operands in
// registers. Once there is support for stack alignment, LEGAL_HACK
// can be removed.
#define LEGAL_HACK(Vect) legalizeToVar((Vect))
_movp(T2, T);
_pand(T, LEGAL_HACK(SrcT));
_pandn(T2, LEGAL_HACK(SrcF));
_por(T, T2);
_movp(Dest, T);
#undef LEGAL_HACK
return;
}
// a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1:
Operand *ConditionRMI = legalize(Condition);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
InstX8632Label *Label = InstX8632Label::create(Func, this);
......@@ -3423,7 +3458,7 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Operand *SrcLoRI = legalize(loOperand(SrcT), Legal_Reg | Legal_Imm, true);
Operand *SrcHiRI = legalize(hiOperand(SrcT), Legal_Reg | Legal_Imm, true);
_cmp(Condition, Zero);
_cmp(ConditionRMI, Zero);
_mov(DestLo, SrcLoRI);
_mov(DestHi, SrcHiRI);
_br(InstX8632Br::Br_ne, Label);
......@@ -3436,7 +3471,7 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
_mov(DestLo, SrcLoRI);
_mov(DestHi, SrcHiRI);
} else {
_cmp(Condition, Zero);
_cmp(ConditionRMI, Zero);
SrcT = legalize(SrcT, Legal_Reg | Legal_Imm, true);
_mov(Dest, SrcT);
_br(InstX8632Br::Br_ne, Label);
......
......@@ -304,6 +304,9 @@ protected:
// Creates an InstX8632Pand from Dest and Src0 and inserts it through Context.
void _pand(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pand::create(Func, Dest, Src0));
}
// Creates an InstX8632Pandn from Dest and Src0 and inserts it through Context.
void _pandn(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pandn::create(Func, Dest, Src0));
}
// Creates an InstX8632Pcmpeq from Dest and Src0 and inserts it through Context.
void _pcmpeq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
}
......
; This file tests support for the select instruction with vector valued inputs.
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
; RUN: | FileCheck --check-prefix=DUMP %s
; Select between <16 x i8> operands; the generated code is expected to contain
; a pand, pandn, por sequence.
define <16 x i8> @test_select_v16i8(<16 x i1> %cond, <16 x i8> %arg1, <16 x i8> %arg2) {
entry:
%res = select <16 x i1> %cond, <16 x i8> %arg1, <16 x i8> %arg2
ret <16 x i8> %res
; CHECK-LABEL: test_select_v16i8:
; CHECK: pand
; CHECK: pandn
; CHECK: por
}
; Select between <16 x i1> operands; the generated code is expected to contain
; a pand, pandn, por sequence.
define <16 x i1> @test_select_v16i1(<16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2) {
entry:
%res = select <16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2
ret <16 x i1> %res
; CHECK-LABEL: test_select_v16i1:
; CHECK: pand
; CHECK: pandn
; CHECK: por
}
; Select between <8 x i16> operands; the generated code is expected to contain
; a pand, pandn, por sequence.
define <8 x i16> @test_select_v8i16(<8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2) {
entry:
%res = select <8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2
ret <8 x i16> %res
; CHECK-LABEL: test_select_v8i16:
; CHECK: pand
; CHECK: pandn
; CHECK: por
}
; Select between <8 x i1> operands; the generated code is expected to contain
; a pand, pandn, por sequence.
define <8 x i1> @test_select_v8i1(<8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2) {
entry:
%res = select <8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2
ret <8 x i1> %res
; CHECK-LABEL: test_select_v8i1:
; CHECK: pand
; CHECK: pandn
; CHECK: por
}
; Select between <4 x i32> operands; the generated code is expected to contain
; a pand, pandn, por sequence.
define <4 x i32> @test_select_v4i32(<4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2) {
entry:
%res = select <4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2
ret <4 x i32> %res
; CHECK-LABEL: test_select_v4i32:
; CHECK: pand
; CHECK: pandn
; CHECK: por
}
; Select between <4 x float> operands; the generated code is expected to
; contain a pand, pandn, por sequence.
define <4 x float> @test_select_v4f32(<4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2) {
entry:
%res = select <4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2
ret <4 x float> %res
; CHECK-LABEL: test_select_v4f32:
; CHECK: pand
; CHECK: pandn
; CHECK: por
}
; Select between <4 x i1> operands; the generated code is expected to contain
; a pand, pandn, por sequence.
define <4 x i1> @test_select_v4i1(<4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2) {
entry:
%res = select <4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2
ret <4 x i1> %res
; CHECK-LABEL: test_select_v4i1:
; CHECK: pand
; CHECK: pandn
; CHECK: por
}
; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment