Commit 0a450519 by Matt Wala

Subzero: Add support for SSE4.1 instructions.

* Add initial support for code generation with SSE4.1 instructions. The following operations are affected:
  - multiplication with v4i32
  - select
  - insertelement
  - extractelement
* Add appropriate lit checks for SSE4.1 instructions. Run the crosstests in both SSE2 and SSE4.1 mode.
* Introduce the -mattr flag to llvm2ice to control which instruction set gets used.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/427843002
parent e6e497db
...@@ -43,6 +43,9 @@ if __name__ == '__main__': ...@@ -43,6 +43,9 @@ if __name__ == '__main__':
metavar='OPTLEVEL', metavar='OPTLEVEL',
help='Optimization level ' + help='Optimization level ' +
'(m1 and -1 are equivalent)') '(m1 and -1 are equivalent)')
argparser.add_argument('--mattr', required=False, default='sse2',
dest='attr', choices=['sse2', 'sse4.1'],
metavar='ATTRIBUTE', help='Target attribute')
argparser.add_argument('--prefix', required=True, argparser.add_argument('--prefix', required=True,
metavar='SZ_PREFIX', metavar='SZ_PREFIX',
help='String prepended to Subzero symbol names') help='String prepended to Subzero symbol names')
...@@ -93,6 +96,7 @@ if __name__ == '__main__': ...@@ -93,6 +96,7 @@ if __name__ == '__main__':
obj_llc = os.path.join(args.dir, base + '.llc.o') obj_llc = os.path.join(args.dir, base + '.llc.o')
shellcmd(['../llvm2ice', shellcmd(['../llvm2ice',
'-O' + args.optlevel, '-O' + args.optlevel,
'-mattr=' + args.attr,
'--target=' + args.target, '--target=' + args.target,
'--prefix=' + args.prefix, '--prefix=' + args.prefix,
'-o=' + asm_sz, '-o=' + asm_sz,
......
...@@ -6,116 +6,144 @@ ...@@ -6,116 +6,144 @@
set -eux set -eux
OPTLEVELS="m1 2" OPTLEVELS="m1 2"
ATTRIBUTES="sse2 sse4.1"
OUTDIR=Output OUTDIR=Output
# Clean the output directory to avoid reusing stale results. # Clean the output directory to avoid reusing stale results.
rm -rf "${OUTDIR}" rm -rf "${OUTDIR}"
mkdir -p "${OUTDIR}" mkdir -p "${OUTDIR}"
for optlevel in ${OPTLEVELS} ; do for optlevel in ${OPTLEVELS} ; do
for attribute in ${ATTRIBUTES} ; do
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --prefix=Subzero_ \
--test=simple_loop.c \ --target=x8632 \
--driver=simple_loop_main.c \ --dir="${OUTDIR}" \
--output=simple_loop_O${optlevel} --llvm-bin-path="${LLVM_BIN_PATH}" \
--test=simple_loop.c \
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ --driver=simple_loop_main.c \
--dir="${OUTDIR}" \ --output=simple_loop_O${optlevel}_${attribute}
--llvm-bin-path="${LLVM_BIN_PATH}" \
--test=mem_intrin.cpp \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--driver=mem_intrin_main.cpp \ --prefix=Subzero_ \
--output=mem_intrin_O${optlevel} --target=x8632 \
--dir="${OUTDIR}" \
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ --llvm-bin-path="${LLVM_BIN_PATH}" \
--dir="${OUTDIR}" \ --test=mem_intrin.cpp \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --driver=mem_intrin_main.cpp \
--test=test_arith.cpp \ --output=mem_intrin_O${optlevel}_${attribute}
--test=test_arith_frem.ll \
--test=test_arith_sqrt.ll \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--driver=test_arith_main.cpp \ --prefix=Subzero_ \
--output=test_arith_O${optlevel} --target=x8632 \
--dir="${OUTDIR}" \
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ --llvm-bin-path="${LLVM_BIN_PATH}" \
--dir="${OUTDIR}" \ --test=test_arith.cpp \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --test=test_arith_frem.ll \
--test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \ --test=test_arith_sqrt.ll \
--driver=test_bitmanip_main.cpp \ --driver=test_arith_main.cpp \
--output=test_bitmanip_O${optlevel} --output=test_arith_O${optlevel}_${attribute}
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--dir="${OUTDIR}" \ --prefix=Subzero_ \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --target=x8632 \
--test=test_cast.cpp --test=test_cast_to_u1.ll \ --dir="${OUTDIR}" \
--driver=test_cast_main.cpp \ --llvm-bin-path="${LLVM_BIN_PATH}" \
--output=test_cast_O${optlevel} --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
--driver=test_bitmanip_main.cpp \
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ --output=test_bitmanip_O${optlevel}_${attribute}
--dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--test=test_fcmp.pnacl.ll \ --prefix=Subzero_ \
--driver=test_fcmp_main.cpp \ --target=x8632 \
--output=test_fcmp_O${optlevel} --dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ --test=test_cast.cpp --test=test_cast_to_u1.ll \
--dir="${OUTDIR}" \ --driver=test_cast_main.cpp \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --output=test_cast_O${optlevel}_${attribute}
--test=test_global.cpp \
--driver=test_global_main.cpp \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--output=test_global_O${optlevel} --prefix=Subzero_ \
--target=x8632 \
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ --dir="${OUTDIR}" \
--dir="${OUTDIR}" \ --llvm-bin-path="${LLVM_BIN_PATH}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --test=test_fcmp.pnacl.ll \
--test=test_icmp.cpp --test=test_icmp_i1vec.ll \ --driver=test_fcmp_main.cpp \
--driver=test_icmp_main.cpp \ --output=test_fcmp_O${optlevel}_${attribute}
--output=test_icmp_O${optlevel}
./crosstest.py -O${optlevel} --mattr ${attribute} \
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ --prefix=Subzero_ \
--dir="${OUTDIR}" \ --target=x8632 \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --dir="${OUTDIR}" \
--test=test_select.ll \ --llvm-bin-path="${LLVM_BIN_PATH}" \
--driver=test_select_main.cpp \ --test=test_global.cpp \
--output=test_select_O${optlevel} --driver=test_global_main.cpp \
--output=test_global_O${optlevel}_${attribute}
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --prefix=Subzero_ \
--test=test_stacksave.c \ --target=x8632 \
--driver=test_stacksave_main.c \ --dir="${OUTDIR}" \
--output=test_stacksave_O${optlevel} --llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_icmp.cpp --test=test_icmp_i1vec.ll \
# Compile the non-subzero object files straight from source --driver=test_icmp_main.cpp \
# since the native LLVM backend does not understand how to --output=test_icmp_O${optlevel}_${attribute}
# lower NaCl-specific intrinsics.
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--dir="${OUTDIR}" \ --prefix=Subzero_ \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --target=x8632 \
--test=test_sync_atomic.cpp \ --dir="${OUTDIR}" \
--crosstest-bitcode=0 \ --llvm-bin-path="${LLVM_BIN_PATH}" \
--driver=test_sync_atomic_main.cpp \ --test=test_select.ll \
--output=test_sync_atomic_O${optlevel} --driver=test_select_main.cpp \
--output=test_select_O${optlevel}_${attribute}
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \ ./crosstest.py -O${optlevel} --mattr ${attribute} \
--llvm-bin-path="${LLVM_BIN_PATH}" \ --prefix=Subzero_ \
--test=test_vector_ops.ll \ --target=x8632 \
--driver=test_vector_ops_main.cpp \ --dir="${OUTDIR}" \
--output=test_vector_ops_O${optlevel} --llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_stacksave.c \
--driver=test_stacksave_main.c \
--output=test_stacksave_O${optlevel}_${attribute}
# Compile the non-subzero object files straight from source
# since the native LLVM backend does not understand how to
# lower NaCl-specific intrinsics.
./crosstest.py -O${optlevel} --mattr ${attribute} \
--prefix=Subzero_ \
--target=x8632 \
--dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_sync_atomic.cpp \
--crosstest-bitcode=0 \
--driver=test_sync_atomic_main.cpp \
--output=test_sync_atomic_O${optlevel}_${attribute}
./crosstest.py -O${optlevel} --mattr ${attribute} \
--prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_vector_ops.ll \
--driver=test_vector_ops_main.cpp \
--output=test_vector_ops_O${optlevel}_${attribute}
done
done done
for optlevel in ${OPTLEVELS} ; do for optlevel in ${OPTLEVELS} ; do
"${OUTDIR}"/simple_loop_O${optlevel} for attribute in ${ATTRIBUTES}; do
"${OUTDIR}"/mem_intrin_O${optlevel} "${OUTDIR}"/simple_loop_O${optlevel}_${attribute}
"${OUTDIR}"/test_arith_O${optlevel} "${OUTDIR}"/mem_intrin_O${optlevel}_${attribute}
"${OUTDIR}"/test_bitmanip_O${optlevel} "${OUTDIR}"/test_arith_O${optlevel}_${attribute}
"${OUTDIR}"/test_cast_O${optlevel} "${OUTDIR}"/test_bitmanip_O${optlevel}_${attribute}
"${OUTDIR}"/test_fcmp_O${optlevel} "${OUTDIR}"/test_cast_O${optlevel}_${attribute}
"${OUTDIR}"/test_global_O${optlevel} "${OUTDIR}"/test_fcmp_O${optlevel}_${attribute}
"${OUTDIR}"/test_icmp_O${optlevel} "${OUTDIR}"/test_global_O${optlevel}_${attribute}
"${OUTDIR}"/test_select_O${optlevel} "${OUTDIR}"/test_icmp_O${optlevel}_${attribute}
"${OUTDIR}"/test_stacksave_O${optlevel} "${OUTDIR}"/test_select_O${optlevel}_${attribute}
"${OUTDIR}"/test_sync_atomic_O${optlevel} "${OUTDIR}"/test_stacksave_O${optlevel}_${attribute}
"${OUTDIR}"/test_vector_ops_O${optlevel} "${OUTDIR}"/test_sync_atomic_O${optlevel}_${attribute}
"${OUTDIR}"/test_vector_ops_O${optlevel}_${attribute}
done
done done
...@@ -484,7 +484,7 @@ template <> const char *InstX8632Pxor::Opcode = "pxor"; ...@@ -484,7 +484,7 @@ template <> const char *InstX8632Pxor::Opcode = "pxor";
template <> const char *InstX8632Imul::Opcode = "imul"; template <> const char *InstX8632Imul::Opcode = "imul";
template <> const char *InstX8632Mulps::Opcode = "mulps"; template <> const char *InstX8632Mulps::Opcode = "mulps";
template <> const char *InstX8632Mulss::Opcode = "mulss"; template <> const char *InstX8632Mulss::Opcode = "mulss";
template <> const char *InstX8632Pmullw::Opcode = "pmullw"; template <> const char *InstX8632Pmull::Opcode = "pmull";
template <> const char *InstX8632Pmuludq::Opcode = "pmuludq"; template <> const char *InstX8632Pmuludq::Opcode = "pmuludq";
template <> const char *InstX8632Div::Opcode = "div"; template <> const char *InstX8632Div::Opcode = "div";
template <> const char *InstX8632Divps::Opcode = "divps"; template <> const char *InstX8632Divps::Opcode = "divps";
...@@ -500,10 +500,13 @@ template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq"; ...@@ -500,10 +500,13 @@ template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt"; template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
template <> const char *InstX8632Movss::Opcode = "movss"; template <> const char *InstX8632Movss::Opcode = "movss";
// Ternary ops // Ternary ops
template <> const char *InstX8632Insertps::Opcode = "insertps";
template <> const char *InstX8632Shufps::Opcode = "shufps"; template <> const char *InstX8632Shufps::Opcode = "shufps";
template <> const char *InstX8632Pinsrw::Opcode = "pinsrw"; template <> const char *InstX8632Pinsr::Opcode = "pinsr";
template <> const char *InstX8632Blendvps::Opcode = "blendvps";
template <> const char *InstX8632Pblendvb::Opcode = "pblendvb";
// Three address ops // Three address ops
template <> const char *InstX8632Pextrw::Opcode = "pextrw"; template <> const char *InstX8632Pextr::Opcode = "pextr";
template <> const char *InstX8632Pshufd::Opcode = "pshufd"; template <> const char *InstX8632Pshufd::Opcode = "pshufd";
template <> void InstX8632Sqrtss::emit(const Cfg *Func) const { template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
...@@ -532,6 +535,23 @@ template <> void InstX8632Padd::emit(const Cfg *Func) const { ...@@ -532,6 +535,23 @@ template <> void InstX8632Padd::emit(const Cfg *Func) const {
emitTwoAddress(buf, this, Func); emitTwoAddress(buf, this, Func);
} }
// Emits a packed integer multiply as a two-address instruction. The
// mnemonic is "pmull" with the destination type's PackString appended
// (presumably "w" for v8i16 and "d" for v4i32, i.e. pmullw / pmulld --
// confirm against the TypeX8632Attributes table).
template <> void InstX8632Pmull::emit(const Cfg *Func) const {
char buf[30];
// Only v4i32 and v8i16 destinations are accepted.
bool TypesAreValid = getDest()->getType() == IceType_v4i32 ||
getDest()->getType() == IceType_v8i16;
// A v8i16 multiply is allowed under any instruction set; any other
// type requires the target to be configured for SSE4.1 or later.
bool InstructionSetIsValid =
getDest()->getType() == IceType_v8i16 ||
static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
TargetX8632::SSE4_1;
// The flags are only read by the asserts below; the void casts keep
// them referenced (presumably to avoid unused-variable warnings in
// builds where assert() compiles to nothing).
(void)TypesAreValid;
(void)InstructionSetIsValid;
assert(TypesAreValid);
assert(InstructionSetIsValid);
// Build the full mnemonic, then emit in two-address form.
snprintf(buf, llvm::array_lengthof(buf), "pmull%s",
TypeX8632Attributes[getDest()->getType()].PackString);
emitTwoAddress(buf, this, Func);
}
template <> void InstX8632Subss::emit(const Cfg *Func) const { template <> void InstX8632Subss::emit(const Cfg *Func) const {
char buf[30]; char buf[30];
snprintf(buf, llvm::array_lengthof(buf), "sub%s", snprintf(buf, llvm::array_lengthof(buf), "sub%s",
...@@ -553,12 +573,6 @@ template <> void InstX8632Mulss::emit(const Cfg *Func) const { ...@@ -553,12 +573,6 @@ template <> void InstX8632Mulss::emit(const Cfg *Func) const {
emitTwoAddress(buf, this, Func); emitTwoAddress(buf, this, Func);
} }
template <> void InstX8632Pmullw::emit(const Cfg *Func) const {
assert(getSrc(0)->getType() == IceType_v8i16 &&
getSrc(1)->getType() == IceType_v8i16);
emitTwoAddress(Opcode, this, Func);
}
template <> void InstX8632Pmuludq::emit(const Cfg *Func) const { template <> void InstX8632Pmuludq::emit(const Cfg *Func) const {
assert(getSrc(0)->getType() == IceType_v4i32 && assert(getSrc(0)->getType() == IceType_v4i32 &&
getSrc(1)->getType() == IceType_v4i32); getSrc(1)->getType() == IceType_v4i32);
...@@ -588,6 +602,38 @@ template <> void InstX8632Idiv::emit(const Cfg *Func) const { ...@@ -588,6 +602,38 @@ template <> void InstX8632Idiv::emit(const Cfg *Func) const {
Str << "\n"; Str << "\n";
} }
namespace {
// pblendvb and blendvps take xmm0 as a final implicit argument.
//
// Shared emitter for the variable-blend instructions. Prints
// "<Opcode> <dest>, <src1>" and nothing else: source 2 is checked to be
// in xmm0 but is never printed, and source 0 is not printed either.
//
// Opcode: mnemonic to print.
// Inst:   the instruction being emitted; must have exactly three sources.
// Func:   the function whose context supplies the emit stream.
void emitVariableBlendInst(const char *Opcode, const Inst *Inst,
const Cfg *Func) {
Ostream &Str = Func->getContext()->getStrEmit();
assert(Inst->getSrcSize() == 3);
// The third source must be a Variable assigned to register xmm0.
assert(llvm::isa<Variable>(Inst->getSrc(2)));
assert(llvm::cast<Variable>(Inst->getSrc(2))->getRegNum() ==
TargetX8632::Reg_xmm0);
Str << "\t" << Opcode << "\t";
Inst->getDest()->emit(Func);
Str << ", ";
Inst->getSrc(1)->emit(Func);
Str << "\n";
}
} // end anonymous namespace
// Emits blendvps. Asserts that the target's instruction set is at least
// SSE4.1, then hands off to emitVariableBlendInst for the actual output.
template <> void InstX8632Blendvps::emit(const Cfg *Func) const {
assert(static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
TargetX8632::SSE4_1);
emitVariableBlendInst(Opcode, this, Func);
}
// Emits pblendvb. Asserts that the target's instruction set is at least
// SSE4.1, then hands off to emitVariableBlendInst for the actual output.
template <> void InstX8632Pblendvb::emit(const Cfg *Func) const {
assert(static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet() >=
TargetX8632::SSE4_1);
emitVariableBlendInst(Opcode, this, Func);
}
template <> void InstX8632Imul::emit(const Cfg *Func) const { template <> void InstX8632Imul::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit(); Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2); assert(getSrcSize() == 2);
...@@ -1127,13 +1173,19 @@ template <> void InstX8632Pcmpgt::emit(const Cfg *Func) const { ...@@ -1127,13 +1173,19 @@ template <> void InstX8632Pcmpgt::emit(const Cfg *Func) const {
emitTwoAddress(buf, this, Func); emitTwoAddress(buf, this, Func);
} }
template <> void InstX8632Pextrw::emit(const Cfg *Func) const { template <> void InstX8632Pextr::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit(); Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2); assert(getSrcSize() == 2);
Str << "\t" << Opcode << "\t"; // pextrb and pextrd are SSE4.1 instructions.
assert(getSrc(0)->getType() == IceType_v8i16 ||
getSrc(0)->getType() == IceType_v8i1 ||
static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet()
>= TargetX8632::SSE4_1);
Str << "\t" << Opcode
<< TypeX8632Attributes[getSrc(0)->getType()].PackString << "\t";
Variable *Dest = getDest(); Variable *Dest = getDest();
assert(Dest->hasReg() && Dest->getType() == IceType_i16); // pextrw must take a register dest.
// pextrw takes r32 dest. assert(Dest->getType() != IceType_i16 || Dest->hasReg());
Dest->asType(IceType_i32).emit(Func); Dest->asType(IceType_i32).emit(Func);
Str << ", "; Str << ", ";
getSrc(0)->emit(Func); getSrc(0)->emit(Func);
...@@ -1142,16 +1194,26 @@ template <> void InstX8632Pextrw::emit(const Cfg *Func) const { ...@@ -1142,16 +1194,26 @@ template <> void InstX8632Pextrw::emit(const Cfg *Func) const {
Str << "\n"; Str << "\n";
} }
template <> void InstX8632Pinsrw::emit(const Cfg *Func) const { template <> void InstX8632Pinsr::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit(); Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 3); assert(getSrcSize() == 3);
Str << "\t" << Opcode << "\t"; // pinsrb and pinsrd are SSE4.1 instructions.
assert(getDest()->getType() == IceType_v8i16 ||
getDest()->getType() == IceType_v8i1 ||
static_cast<TargetX8632 *>(Func->getTarget())->getInstructionSet()
>= TargetX8632::SSE4_1);
Str << "\t" << Opcode
<< TypeX8632Attributes[getDest()->getType()].PackString << "\t";
getDest()->emit(Func); getDest()->emit(Func);
Str << ", "; Str << ", ";
Operand *Src1 = getSrc(1); Operand *Src1 = getSrc(1);
if (Variable *VSrc1 = llvm::dyn_cast<Variable>(Src1)) { if (Variable *VSrc1 = llvm::dyn_cast<Variable>(Src1)) {
// If src1 is a register, it should be r32. // If src1 is a register, it should always be r32.
VSrc1->asType(VSrc1->hasReg() ? IceType_i32 : IceType_i16).emit(Func); if (VSrc1->hasReg()) {
VSrc1->asType(IceType_i32).emit(Func);
} else {
VSrc1->emit(Func);
}
} else { } else {
Src1->emit(Func); Src1->emit(Func);
} }
...@@ -1216,7 +1278,9 @@ void InstX8632Push::dump(const Cfg *Func) const { ...@@ -1216,7 +1278,9 @@ void InstX8632Push::dump(const Cfg *Func) const {
template <> void InstX8632Psll::emit(const Cfg *Func) const { template <> void InstX8632Psll::emit(const Cfg *Func) const {
assert(getDest()->getType() == IceType_v8i16 || assert(getDest()->getType() == IceType_v8i16 ||
getDest()->getType() == IceType_v4i32); getDest()->getType() == IceType_v8i1 ||
getDest()->getType() == IceType_v4i32 ||
getDest()->getType() == IceType_v4i1);
char buf[30]; char buf[30];
snprintf(buf, llvm::array_lengthof(buf), "psll%s", snprintf(buf, llvm::array_lengthof(buf), "psll%s",
TypeX8632Attributes[getDest()->getType()].PackString); TypeX8632Attributes[getDest()->getType()].PackString);
...@@ -1225,7 +1289,9 @@ template <> void InstX8632Psll::emit(const Cfg *Func) const { ...@@ -1225,7 +1289,9 @@ template <> void InstX8632Psll::emit(const Cfg *Func) const {
template <> void InstX8632Psra::emit(const Cfg *Func) const { template <> void InstX8632Psra::emit(const Cfg *Func) const {
assert(getDest()->getType() == IceType_v8i16 || assert(getDest()->getType() == IceType_v8i16 ||
getDest()->getType() == IceType_v4i32); getDest()->getType() == IceType_v8i1 ||
getDest()->getType() == IceType_v4i32 ||
getDest()->getType() == IceType_v4i1);
char buf[30]; char buf[30];
snprintf(buf, llvm::array_lengthof(buf), "psra%s", snprintf(buf, llvm::array_lengthof(buf), "psra%s",
TypeX8632Attributes[getDest()->getType()].PackString); TypeX8632Attributes[getDest()->getType()].PackString);
......
...@@ -88,9 +88,9 @@ ...@@ -88,9 +88,9 @@
X(IceType_i64, IceType_void, "si", "" , "" , "qword ptr") \ X(IceType_i64, IceType_void, "si", "" , "" , "qword ptr") \
X(IceType_f32, IceType_void, "ss", "ss", "" , "dword ptr") \ X(IceType_f32, IceType_void, "ss", "ss", "" , "dword ptr") \
X(IceType_f64, IceType_void, "sd", "sd", "" , "qword ptr") \ X(IceType_f64, IceType_void, "sd", "sd", "" , "qword ptr") \
X(IceType_v4i1, IceType_i32 , "?" , "" , "" , "xmmword ptr") \ X(IceType_v4i1, IceType_i32 , "?" , "" , "d", "xmmword ptr") \
X(IceType_v8i1, IceType_i16 , "?" , "" , "" , "xmmword ptr") \ X(IceType_v8i1, IceType_i16 , "?" , "" , "w", "xmmword ptr") \
X(IceType_v16i1, IceType_i8 , "?" , "" , "" , "xmmword ptr") \ X(IceType_v16i1, IceType_i8 , "?" , "" , "b", "xmmword ptr") \
X(IceType_v16i8, IceType_i8 , "?" , "" , "b", "xmmword ptr") \ X(IceType_v16i8, IceType_i8 , "?" , "" , "b", "xmmword ptr") \
X(IceType_v8i16, IceType_i16 , "?" , "" , "w", "xmmword ptr") \ X(IceType_v8i16, IceType_i16 , "?" , "" , "w", "xmmword ptr") \
X(IceType_v4i32, IceType_i32 , "dq", "" , "d", "xmmword ptr") \ X(IceType_v4i32, IceType_i32 , "dq", "" , "d", "xmmword ptr") \
......
...@@ -138,6 +138,7 @@ public: ...@@ -138,6 +138,7 @@ public:
Addps, Addps,
Addss, Addss,
And, And,
Blendvps,
Br, Br,
Bsf, Bsf,
Bsr, Bsr,
...@@ -157,6 +158,7 @@ public: ...@@ -157,6 +158,7 @@ public:
Icmp, Icmp,
Idiv, Idiv,
Imul, Imul,
Insertps,
Label, Label,
Lea, Lea,
Load, Load,
...@@ -176,11 +178,12 @@ public: ...@@ -176,11 +178,12 @@ public:
Padd, Padd,
Pand, Pand,
Pandn, Pandn,
Pblendvb,
Pcmpeq, Pcmpeq,
Pcmpgt, Pcmpgt,
Pextrw, Pextr,
Pinsrw, Pinsr,
Pmullw, Pmull,
Pmuludq, Pmuludq,
Pop, Pop,
Por, Por,
...@@ -573,7 +576,7 @@ typedef InstX8632Binop<InstX8632::Pxor> InstX8632Pxor; ...@@ -573,7 +576,7 @@ typedef InstX8632Binop<InstX8632::Pxor> InstX8632Pxor;
typedef InstX8632Binop<InstX8632::Imul> InstX8632Imul; typedef InstX8632Binop<InstX8632::Imul> InstX8632Imul;
typedef InstX8632Binop<InstX8632::Mulps> InstX8632Mulps; typedef InstX8632Binop<InstX8632::Mulps> InstX8632Mulps;
typedef InstX8632Binop<InstX8632::Mulss> InstX8632Mulss; typedef InstX8632Binop<InstX8632::Mulss> InstX8632Mulss;
typedef InstX8632Binop<InstX8632::Pmullw> InstX8632Pmullw; typedef InstX8632Binop<InstX8632::Pmull> InstX8632Pmull;
typedef InstX8632Binop<InstX8632::Pmuludq> InstX8632Pmuludq; typedef InstX8632Binop<InstX8632::Pmuludq> InstX8632Pmuludq;
typedef InstX8632Binop<InstX8632::Divps> InstX8632Divps; typedef InstX8632Binop<InstX8632::Divps> InstX8632Divps;
typedef InstX8632Binop<InstX8632::Divss> InstX8632Divss; typedef InstX8632Binop<InstX8632::Divss> InstX8632Divss;
...@@ -594,9 +597,12 @@ typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt; ...@@ -594,9 +597,12 @@ typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss; typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss;
typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv; typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
typedef InstX8632Ternop<InstX8632::Div> InstX8632Div; typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw; typedef InstX8632Ternop<InstX8632::Insertps> InstX8632Insertps;
typedef InstX8632Ternop<InstX8632::Pinsr> InstX8632Pinsr;
typedef InstX8632Ternop<InstX8632::Shufps> InstX8632Shufps; typedef InstX8632Ternop<InstX8632::Shufps> InstX8632Shufps;
typedef InstX8632ThreeAddressop<InstX8632::Pextrw> InstX8632Pextrw; typedef InstX8632Ternop<InstX8632::Blendvps> InstX8632Blendvps;
typedef InstX8632Ternop<InstX8632::Pblendvb> InstX8632Pblendvb;
typedef InstX8632ThreeAddressop<InstX8632::Pextr> InstX8632Pextr;
typedef InstX8632ThreeAddressop<InstX8632::Pshufd> InstX8632Pshufd; typedef InstX8632ThreeAddressop<InstX8632::Pshufd> InstX8632Pshufd;
// Base class for a lockable x86-32 instruction (emits a locked prefix). // Base class for a lockable x86-32 instruction (emits a locked prefix).
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "IceOperand.h" #include "IceOperand.h"
#include "IceTargetLoweringX8632.def" #include "IceTargetLoweringX8632.def"
#include "IceTargetLoweringX8632.h" #include "IceTargetLoweringX8632.h"
#include "llvm/Support/CommandLine.h"
namespace Ice { namespace Ice {
...@@ -123,6 +124,17 @@ const unsigned X86_MAX_XMM_ARGS = 4; ...@@ -123,6 +124,17 @@ const unsigned X86_MAX_XMM_ARGS = 4;
// The number of bits in a byte // The number of bits in a byte
const unsigned X86_CHAR_BIT = 8; const unsigned X86_CHAR_BIT = 8;
// Instruction set options
//
// Defines the "mattr" command-line option, which selects a
// TargetX8632::X86InstructionSet value. Accepted spellings are "sse2"
// and "sse4.1"; when the option is not given the value is SSE2.
namespace cl = ::llvm::cl;
cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet(
"mattr", cl::desc("X86 target attributes"),
cl::init(TargetX8632::SSE2),
cl::values(
clEnumValN(TargetX8632::SSE2, "sse2",
"Enable SSE2 instructions (default)"),
clEnumValN(TargetX8632::SSE4_1, "sse4.1",
"Enable SSE 4.1 instructions"), clEnumValEnd));
// Return a string representation of the type that is suitable for use // Return a string representation of the type that is suitable for use
// in an identifier. // in an identifier.
IceString typeIdentString(const Type Ty) { IceString typeIdentString(const Type Ty) {
...@@ -234,8 +246,9 @@ void __attribute__((unused)) xMacroIntegrityCheck() { ...@@ -234,8 +246,9 @@ void __attribute__((unused)) xMacroIntegrityCheck() {
} // end of anonymous namespace } // end of anonymous namespace
TargetX8632::TargetX8632(Cfg *Func) TargetX8632::TargetX8632(Cfg *Func)
: TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0), : TargetLowering(Func), InstructionSet(CLInstructionSet),
LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0),
NextLabelNumber(0), ComputedLiveRanges(false),
PhysicalRegisters(VarList(Reg_NUM)) { PhysicalRegisters(VarList(Reg_NUM)) {
// TODO: Don't initialize IntegerRegisters and friends every time. // TODO: Don't initialize IntegerRegisters and friends every time.
// Instead, initialize in some sort of static initializer for the // Instead, initialize in some sort of static initializer for the
...@@ -1228,7 +1241,16 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1228,7 +1241,16 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
_movp(Dest, T); _movp(Dest, T);
} break; } break;
case InstArithmetic::Mul: { case InstArithmetic::Mul: {
if (Dest->getType() == IceType_v4i32) { bool TypesAreValidForPmull =
Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
bool InstructionSetIsValidForPmull =
Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1;
if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_pmull(T, legalizeToVar(Src1));
_movp(Dest, T);
} else if (Dest->getType() == IceType_v4i32) {
// Lowering sequence: // Lowering sequence:
// Note: The mask arguments have index 0 on the left. // Note: The mask arguments have index 0 on the left.
// //
...@@ -1243,8 +1265,6 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1243,8 +1265,6 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
// shufps T1, T2, {0,2,0,2} // shufps T1, T2, {0,2,0,2}
// pshufd T4, T1, {0,2,1,3} // pshufd T4, T1, {0,2,1,3}
// movups Dest, T4 // movups Dest, T4
//
// TODO(wala): SSE4.1 has pmulld.
// Mask that directs pshufd to create a vector with entries // Mask that directs pshufd to create a vector with entries
// Src[1, 0, 3, 0] // Src[1, 0, 3, 0]
...@@ -1273,11 +1293,6 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1273,11 +1293,6 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
_shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));
_pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));
_movp(Dest, T4); _movp(Dest, T4);
} else if (Dest->getType() == IceType_v8i16) {
Variable *T = makeReg(IceType_v8i16);
_movp(T, Src0);
_pmullw(T, legalizeToVar(Src1));
_movp(Dest, T);
} else { } else {
assert(Dest->getType() == IceType_v16i8); assert(Dest->getType() == IceType_v16i8);
// Sz_mul_v16i8 // Sz_mul_v16i8
...@@ -2155,10 +2170,15 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) { ...@@ -2155,10 +2170,15 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
Variable *ExtractedElement = makeReg(InVectorElementTy); Variable *ExtractedElement = makeReg(InVectorElementTy);
// TODO(wala): Determine the best lowering sequences for each type. // TODO(wala): Determine the best lowering sequences for each type.
if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { bool CanUsePextr =
// Lower extractelement operations where the element is 32 bits Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1;
// wide with pshufd. if (CanUsePextr && Ty != IceType_v4f32) {
// TODO(wala): SSE4.1 has extractps and pextrd // Use pextrb, pextrw, or pextrd.
Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
Variable *SourceVectR = legalizeToVar(SourceVectOperand);
_pextr(ExtractedElement, SourceVectR, Mask);
} else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
// Use pshufd and movd/movss.
// //
// ALIGNHACK: Force vector operands to registers in instructions that // ALIGNHACK: Force vector operands to registers in instructions that
// require aligned memory operands until support for stack alignment // require aligned memory operands until support for stack alignment
...@@ -2187,13 +2207,9 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) { ...@@ -2187,13 +2207,9 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
_movss(ExtractedElement, T); _movss(ExtractedElement, T);
} }
#undef ALIGN_HACK #undef ALIGN_HACK
} else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
_pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask);
} else { } else {
assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
// Spill the value to a stack slot and do the extraction in memory. // Spill the value to a stack slot and do the extraction in memory.
// TODO(wala): SSE4.1 has pextrb.
// //
// TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
// support for legalizing to mem is implemented. // support for legalizing to mem is implemented.
...@@ -2539,10 +2555,18 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { ...@@ -2539,10 +2555,18 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
ElementToInsert = Expanded; ElementToInsert = Expanded;
} }
if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
// Lower insertelement with 32-bit wide elements using shufps or // Use insertps, pinsrb, pinsrw, or pinsrd.
// movss. Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
// TODO(wala): SSE4.1 has pinsrd and insertps. Variable *T = makeReg(Ty);
_movp(T, SourceVectOperand);
if (Ty == IceType_v4f32)
_insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4));
else
_pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index));
_movp(Inst->getDest(), T);
} else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
// Use shufps or movss.
Variable *Element = NULL; Variable *Element = NULL;
if (InVectorElementTy == IceType_f32) { if (InVectorElementTy == IceType_f32) {
// Element will be in an XMM register since it is floating point. // Element will be in an XMM register since it is floating point.
...@@ -2607,17 +2631,10 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { ...@@ -2607,17 +2631,10 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
_movp(Inst->getDest(), T); _movp(Inst->getDest(), T);
} }
#undef ALIGN_HACK #undef ALIGN_HACK
} else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
Variable *T = makeReg(Ty);
_movp(T, SourceVectOperand);
_pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index));
_movp(Inst->getDest(), T);
} else { } else {
assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
// Spill the value to a stack slot and perform the insertion in // Spill the value to a stack slot and perform the insertion in
// memory. // memory.
// TODO(wala): SSE4.1 has pinsrb.
// //
// TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
// support for legalizing to mem is implemented. // support for legalizing to mem is implemented.
...@@ -3551,11 +3568,42 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) { ...@@ -3551,11 +3568,42 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
Operand *Condition = Inst->getCondition(); Operand *Condition = Inst->getCondition();
if (isVectorType(Dest->getType())) { if (isVectorType(Dest->getType())) {
// a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)
// TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has
// blendps and pblendw for constant condition operands.
Type SrcTy = SrcT->getType(); Type SrcTy = SrcT->getType();
Variable *T = makeReg(SrcTy); Variable *T = makeReg(SrcTy);
// ALIGNHACK: Until stack alignment support is implemented, vector
// instructions need to have vector operands in registers. Once
// there is support for stack alignment, LEGAL_HACK can be removed.
#define LEGAL_HACK(Vect) legalizeToVar((Vect))
if (InstructionSet >= SSE4_1) {
// TODO(wala): If the condition operand is a constant, use blendps
// or pblendw.
//
// Use blendvps or pblendvb to implement select.
if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
SrcTy == IceType_v4f32) {
Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0);
_movp(xmm0, Condition);
_psll(xmm0, Ctx->getConstantInt(IceType_i8, 31));
_movp(T, SrcF);
_blendvps(T, LEGAL_HACK(SrcT), xmm0);
_movp(Dest, T);
} else {
assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
: IceType_v16i8;
Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0);
lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
_movp(T, SrcF);
_pblendvb(T, LEGAL_HACK(SrcT), xmm0);
_movp(Dest, T);
}
return;
}
// Lower select without SSE4.1:
// a=d?b:c ==>
// if elementtype(d) != i1:
// d=sext(d);
// a=(b&d)|(c&~d);
Variable *T2 = makeReg(SrcTy); Variable *T2 = makeReg(SrcTy);
// Sign extend the condition operand if applicable. // Sign extend the condition operand if applicable.
if (SrcTy == IceType_v4f32) { if (SrcTy == IceType_v4f32) {
...@@ -3568,11 +3616,6 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) { ...@@ -3568,11 +3616,6 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
} else { } else {
_movp(T, Condition); _movp(T, Condition);
} }
// ALIGNHACK: Until stack alignment support is implemented, the
// bitwise vector instructions need to have both operands in
// registers. Once there is support for stack alignment, LEGAL_HACK
// can be removed.
#define LEGAL_HACK(Vect) legalizeToVar((Vect))
_movp(T2, T); _movp(T2, T);
_pand(T, LEGAL_HACK(SrcT)); _pand(T, LEGAL_HACK(SrcT));
_pandn(T2, LEGAL_HACK(SrcF)); _pandn(T2, LEGAL_HACK(SrcF));
......
...@@ -71,6 +71,14 @@ public: ...@@ -71,6 +71,14 @@ public:
Reg_NUM Reg_NUM
}; };
// Instruction-set levels the x86-32 target can generate code for.
// The enumerators are listed from least to most capable, and the
// lowering code relies on that order when it tests
// "InstructionSet >= SSE4_1"; keep any additions in ascending order.
enum X86InstructionSet {
// SSE2 is the PNaCl baseline instruction set.
SSE2,
// SSE4.1 enables the insertps/pinsr and blendvps/pblendvb lowerings.
SSE4_1
};
// Returns the instruction set stored in the InstructionSet member.
X86InstructionSet getInstructionSet() const { return InstructionSet; }
protected: protected:
TargetX8632(Cfg *Func); TargetX8632(Cfg *Func);
...@@ -186,6 +194,9 @@ protected: ...@@ -186,6 +194,9 @@ protected:
void _and(Variable *Dest, Operand *Src0) { void _and(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632And::create(Func, Dest, Src0)); Context.insert(InstX8632And::create(Func, Dest, Src0));
} }
// Inserts an InstX8632Blendvps built from (Dest, Src0, Src1) into the
// current lowering context. The vector select lowering calls this with
// the false value already copied into Dest, the true value as Src0, and
// a variable created with Reg_xmm0 holding the shifted condition as Src1.
void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Blendvps::create(Func, Dest, Src0, Src1));
}
void _br(InstX8632::BrCond Condition, CfgNode *TargetTrue, void _br(InstX8632::BrCond Condition, CfgNode *TargetTrue,
CfgNode *TargetFalse) { CfgNode *TargetFalse) {
Context.insert( Context.insert(
...@@ -260,6 +271,9 @@ protected: ...@@ -260,6 +271,9 @@ protected:
void _imul(Variable *Dest, Operand *Src0) { void _imul(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Imul::create(Func, Dest, Src0)); Context.insert(InstX8632Imul::create(Func, Dest, Src0));
} }
// Inserts an InstX8632Insertps built from (Dest, Src0, Src1) into the
// current lowering context. The insertelement lowering calls this for
// v4f32 with a copy of the source vector as Dest, the legalized scalar
// as Src0, and an i8 constant equal to (Index << 4) as Src1.
void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Insertps::create(Func, Dest, Src0, Src1));
}
void _lea(Variable *Dest, Operand *Src0) { void _lea(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Lea::create(Func, Dest, Src0)); Context.insert(InstX8632Lea::create(Func, Dest, Src0));
} }
...@@ -317,20 +331,23 @@ protected: ...@@ -317,20 +331,23 @@ protected:
void _pandn(Variable *Dest, Operand *Src0) { void _pandn(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pandn::create(Func, Dest, Src0)); Context.insert(InstX8632Pandn::create(Func, Dest, Src0));
} }
// Inserts an InstX8632Pblendvb built from (Dest, Src0, Src1) into the
// current lowering context. The vector select lowering uses this for
// 8- and 16-element vectors: Dest holds a copy of the false value, Src0
// is the true value, and Src1 is a variable created with Reg_xmm0 that
// receives the sign-extended condition.
void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Pblendvb::create(Func, Dest, Src0, Src1));
}
void _pcmpeq(Variable *Dest, Operand *Src0) { void _pcmpeq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0)); Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
} }
void _pcmpgt(Variable *Dest, Operand *Src0) { void _pcmpgt(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0)); Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0));
} }
void _pextrw(Variable *Dest, Operand *Src0, Operand *Src1) { void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Pextrw::create(Func, Dest, Src0, Src1)); Context.insert(InstX8632Pextr::create(Func, Dest, Src0, Src1));
} }
void _pinsrw(Variable *Dest, Operand *Src0, Operand *Src1) { void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Pinsrw::create(Func, Dest, Src0, Src1)); Context.insert(InstX8632Pinsr::create(Func, Dest, Src0, Src1));
} }
void _pmullw(Variable *Dest, Operand *Src0) { void _pmull(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pmullw::create(Func, Dest, Src0)); Context.insert(InstX8632Pmull::create(Func, Dest, Src0));
} }
void _pmuludq(Variable *Dest, Operand *Src0) { void _pmuludq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Pmuludq::create(Func, Dest, Src0)); Context.insert(InstX8632Pmuludq::create(Func, Dest, Src0));
...@@ -428,6 +445,7 @@ protected: ...@@ -428,6 +445,7 @@ protected:
Context.insert(InstX8632Xor::create(Func, Dest, Src0)); Context.insert(InstX8632Xor::create(Func, Dest, Src0));
} }
// Instruction set consulted by the lowering code to choose between the
// SSE2 and SSE4.1 sequences. Declared const, so it cannot change after
// the target object is constructed.
const X86InstructionSet InstructionSet;
bool IsEbpBasedFrame; bool IsEbpBasedFrame;
size_t FrameSizeLocals; size_t FrameSizeLocals;
size_t LocalsSizeBytes; size_t LocalsSizeBytes;
......
...@@ -2,10 +2,18 @@ ...@@ -2,10 +2,18 @@
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -O2 --verbose none %s \ ; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj ; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \ ; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj ; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \ ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
...@@ -306,6 +314,9 @@ entry: ...@@ -306,6 +314,9 @@ entry:
; CHECK-LABEL: test_mul_v4i32: ; CHECK-LABEL: test_mul_v4i32:
; CHECK: pmuludq ; CHECK: pmuludq
; CHECK: pmuludq ; CHECK: pmuludq
;
; SSE41-LABEL: test_mul_v4i32:
; SSE41: pmulld
} }
define <4 x i32> @test_shl_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) { define <4 x i32> @test_shl_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
...@@ -314,6 +325,9 @@ entry: ...@@ -314,6 +325,9 @@ entry:
ret <4 x i32> %res ret <4 x i32> %res
; CHECK-LABEL: test_shl_v4i32: ; CHECK-LABEL: test_shl_v4i32:
; CHECK: Sz_shl_v4i32 ; CHECK: Sz_shl_v4i32
; This line is to ensure that pmulld is generated in test_mul_v4i32 above.
; SSE41-LABEL: test_shl_v4i32:
} }
define <4 x i32> @test_lshr_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) { define <4 x i32> @test_lshr_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
......
; This checks support for insertelement and extractelement. ; This checks support for insertelement and extractelement.
; RUN: %llvm2ice --verbose inst %s | FileCheck %s ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -O2 --verbose none %s \ ; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj ; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \ ; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj ; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \ ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
...@@ -18,6 +27,9 @@ entry: ...@@ -18,6 +27,9 @@ entry:
ret <4 x float> %res ret <4 x float> %res
; CHECK-LABEL: insertelement_v4f32_0: ; CHECK-LABEL: insertelement_v4f32_0:
; CHECK: movss ; CHECK: movss
; SSE41-LABEL: insertelement_v4f32_0:
; SSE41: insertps {{.*}}, {{.*}}, 0
} }
define <4 x i32> @insertelement_v4i32_0(<4 x i32> %vec, i32 %elt) { define <4 x i32> @insertelement_v4i32_0(<4 x i32> %vec, i32 %elt) {
...@@ -26,6 +38,9 @@ entry: ...@@ -26,6 +38,9 @@ entry:
ret <4 x i32> %res ret <4 x i32> %res
; CHECK-LABEL: insertelement_v4i32_0: ; CHECK-LABEL: insertelement_v4i32_0:
; CHECK: movss ; CHECK: movss
; SSE41-LABEL: insertelement_v4i32_0:
; SSE41: pinsrd {{.*}}, {{.*}}, 0
} }
...@@ -36,6 +51,9 @@ entry: ...@@ -36,6 +51,9 @@ entry:
; CHECK-LABEL: insertelement_v4f32_1: ; CHECK-LABEL: insertelement_v4f32_1:
; CHECK: shufps ; CHECK: shufps
; CHECK: shufps ; CHECK: shufps
; SSE41-LABEL: insertelement_v4f32_1:
; SSE41: insertps {{.*}}, {{.*}}, 16
} }
define <4 x i32> @insertelement_v4i32_1(<4 x i32> %vec, i32 %elt) { define <4 x i32> @insertelement_v4i32_1(<4 x i32> %vec, i32 %elt) {
...@@ -45,6 +63,9 @@ entry: ...@@ -45,6 +63,9 @@ entry:
; CHECK-LABEL: insertelement_v4i32_1: ; CHECK-LABEL: insertelement_v4i32_1:
; CHECK: shufps ; CHECK: shufps
; CHECK: shufps ; CHECK: shufps
; SSE41-LABEL: insertelement_v4i32_1:
; SSE41: pinsrd {{.*}}, {{.*}}, 1
} }
define <8 x i16> @insertelement_v8i16(<8 x i16> %vec, i32 %elt.arg) { define <8 x i16> @insertelement_v8i16(<8 x i16> %vec, i32 %elt.arg) {
...@@ -52,8 +73,11 @@ entry: ...@@ -52,8 +73,11 @@ entry:
%elt = trunc i32 %elt.arg to i16 %elt = trunc i32 %elt.arg to i16
%res = insertelement <8 x i16> %vec, i16 %elt, i32 1 %res = insertelement <8 x i16> %vec, i16 %elt, i32 1
ret <8 x i16> %res ret <8 x i16> %res
; CHECK-LABEL: insertelement_v8i16 ; CHECK-LABEL: insertelement_v8i16:
; CHECK: pinsrw ; CHECK: pinsrw
; SSE41-LABEL: insertelement_v8i16:
; SSE41: pinsrw
} }
define <16 x i8> @insertelement_v16i8(<16 x i8> %vec, i32 %elt.arg) { define <16 x i8> @insertelement_v16i8(<16 x i8> %vec, i32 %elt.arg) {
...@@ -65,6 +89,9 @@ entry: ...@@ -65,6 +89,9 @@ entry:
; CHECK: movups ; CHECK: movups
; CHECK: lea ; CHECK: lea
; CHECK: mov ; CHECK: mov
; SSE41-LABEL: insertelement_v16i8:
; SSE41: pinsrb
} }
define <4 x i1> @insertelement_v4i1_0(<4 x i1> %vec, i32 %elt.arg) { define <4 x i1> @insertelement_v4i1_0(<4 x i1> %vec, i32 %elt.arg) {
...@@ -74,6 +101,9 @@ entry: ...@@ -74,6 +101,9 @@ entry:
ret <4 x i1> %res ret <4 x i1> %res
; CHECK-LABEL: insertelement_v4i1_0: ; CHECK-LABEL: insertelement_v4i1_0:
; CHECK: movss ; CHECK: movss
; SSE41-LABEL: insertelement_v4i1_0:
; SSE41: pinsrd {{.*}}, {{.*}}, 0
} }
define <4 x i1> @insertelement_v4i1_1(<4 x i1> %vec, i32 %elt.arg) { define <4 x i1> @insertelement_v4i1_1(<4 x i1> %vec, i32 %elt.arg) {
...@@ -84,6 +114,9 @@ entry: ...@@ -84,6 +114,9 @@ entry:
; CHECK-LABEL: insertelement_v4i1_1: ; CHECK-LABEL: insertelement_v4i1_1:
; CHECK: shufps ; CHECK: shufps
; CHECK: shufps ; CHECK: shufps
; SSE41-LABEL: insertelement_v4i1_1:
; SSE41: pinsrd {{.*}}, {{.*}}, 1
} }
define <8 x i1> @insertelement_v8i1(<8 x i1> %vec, i32 %elt.arg) { define <8 x i1> @insertelement_v8i1(<8 x i1> %vec, i32 %elt.arg) {
...@@ -93,6 +126,9 @@ entry: ...@@ -93,6 +126,9 @@ entry:
ret <8 x i1> %res ret <8 x i1> %res
; CHECK-LABEL: insertelement_v8i1: ; CHECK-LABEL: insertelement_v8i1:
; CHECK: pinsrw ; CHECK: pinsrw
; SSE41-LABEL: insertelement_v8i1:
; SSE41: pinsrw
} }
define <16 x i1> @insertelement_v16i1(<16 x i1> %vec, i32 %elt.arg) { define <16 x i1> @insertelement_v16i1(<16 x i1> %vec, i32 %elt.arg) {
...@@ -104,6 +140,9 @@ entry: ...@@ -104,6 +140,9 @@ entry:
; CHECK: movups ; CHECK: movups
; CHECK: lea ; CHECK: lea
; CHECK: mov ; CHECK: mov
; SSE41-LABEL: insertelement_v16i1:
; SSE41: pinsrb
} }
; extractelement operations ; extractelement operations
...@@ -114,6 +153,9 @@ entry: ...@@ -114,6 +153,9 @@ entry:
ret float %res ret float %res
; CHECK-LABEL: extractelement_v4f32: ; CHECK-LABEL: extractelement_v4f32:
; CHECK: pshufd ; CHECK: pshufd
; SSE41-LABEL: extractelement_v4f32:
; SSE41: pshufd
} }
define i32 @extractelement_v4i32(<4 x i32> %vec) { define i32 @extractelement_v4i32(<4 x i32> %vec) {
...@@ -122,6 +164,9 @@ entry: ...@@ -122,6 +164,9 @@ entry:
ret i32 %res ret i32 %res
; CHECK-LABEL: extractelement_v4i32: ; CHECK-LABEL: extractelement_v4i32:
; CHECK: pshufd ; CHECK: pshufd
; SSE41-LABEL: extractelement_v4i32:
; SSE41: pextrd
} }
define i32 @extractelement_v8i16(<8 x i16> %vec) { define i32 @extractelement_v8i16(<8 x i16> %vec) {
...@@ -131,6 +176,9 @@ entry: ...@@ -131,6 +176,9 @@ entry:
ret i32 %res.ext ret i32 %res.ext
; CHECK-LABEL: extractelement_v8i16: ; CHECK-LABEL: extractelement_v8i16:
; CHECK: pextrw ; CHECK: pextrw
; SSE41-LABEL: extractelement_v8i16:
; SSE41: pextrw
} }
define i32 @extractelement_v16i8(<16 x i8> %vec) { define i32 @extractelement_v16i8(<16 x i8> %vec) {
...@@ -142,6 +190,9 @@ entry: ...@@ -142,6 +190,9 @@ entry:
; CHECK: movups ; CHECK: movups
; CHECK: lea ; CHECK: lea
; CHECK: mov ; CHECK: mov
; SSE41-LABEL: extractelement_v16i8:
; SSE41: pextrb
} }
define i32 @extractelement_v4i1(<4 x i1> %vec) { define i32 @extractelement_v4i1(<4 x i1> %vec) {
...@@ -151,6 +202,9 @@ entry: ...@@ -151,6 +202,9 @@ entry:
ret i32 %res.ext ret i32 %res.ext
; CHECK-LABEL: extractelement_v4i1: ; CHECK-LABEL: extractelement_v4i1:
; CHECK: pshufd ; CHECK: pshufd
; SSE41-LABEL: extractelement_v4i1:
; SSE41: pextrd
} }
define i32 @extractelement_v8i1(<8 x i1> %vec) { define i32 @extractelement_v8i1(<8 x i1> %vec) {
...@@ -160,6 +214,9 @@ entry: ...@@ -160,6 +214,9 @@ entry:
ret i32 %res.ext ret i32 %res.ext
; CHECK-LABEL: extractelement_v8i1: ; CHECK-LABEL: extractelement_v8i1:
; CHECK: pextrw ; CHECK: pextrw
; SSE41-LABEL: extractelement_v8i1:
; SSE41: pextrw
} }
define i32 @extractelement_v16i1(<16 x i1> %vec) { define i32 @extractelement_v16i1(<16 x i1> %vec) {
...@@ -171,6 +228,9 @@ entry: ...@@ -171,6 +228,9 @@ entry:
; CHECK: movups ; CHECK: movups
; CHECK: lea ; CHECK: lea
; CHECK: mov ; CHECK: mov
; SSE41-LABEL: extractelement_v16i1:
; SSE41: pextrb
} }
; ERRORS-NOT: ICE translation error ; ERRORS-NOT: ICE translation error
......
...@@ -2,10 +2,18 @@ ...@@ -2,10 +2,18 @@
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
; RUN: | FileCheck %s --check-prefix=SSE41
; RUN: %llvm2ice -O2 --verbose none %s \ ; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj ; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \ ; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj ; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -O2 -mattr=sse4.1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 -mattr=sse4.1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \ ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
...@@ -19,6 +27,9 @@ entry: ...@@ -19,6 +27,9 @@ entry:
; CHECK: pand ; CHECK: pand
; CHECK: pandn ; CHECK: pandn
; CHECK: por ; CHECK: por
; SSE41-LABEL: test_select_v16i8:
; SSE41: pblendvb
} }
define <16 x i1> @test_select_v16i1(<16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2) { define <16 x i1> @test_select_v16i1(<16 x i1> %cond, <16 x i1> %arg1, <16 x i1> %arg2) {
...@@ -29,6 +40,9 @@ entry: ...@@ -29,6 +40,9 @@ entry:
; CHECK: pand ; CHECK: pand
; CHECK: pandn ; CHECK: pandn
; CHECK: por ; CHECK: por
; SSE41-LABEL: test_select_v16i1:
; SSE41: pblendvb
} }
define <8 x i16> @test_select_v8i16(<8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2) { define <8 x i16> @test_select_v8i16(<8 x i1> %cond, <8 x i16> %arg1, <8 x i16> %arg2) {
...@@ -39,6 +53,9 @@ entry: ...@@ -39,6 +53,9 @@ entry:
; CHECK: pand ; CHECK: pand
; CHECK: pandn ; CHECK: pandn
; CHECK: por ; CHECK: por
; SSE41-LABEL: test_select_v8i16:
; SSE41: pblendvb
} }
define <8 x i1> @test_select_v8i1(<8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2) { define <8 x i1> @test_select_v8i1(<8 x i1> %cond, <8 x i1> %arg1, <8 x i1> %arg2) {
...@@ -49,6 +66,9 @@ entry: ...@@ -49,6 +66,9 @@ entry:
; CHECK: pand ; CHECK: pand
; CHECK: pandn ; CHECK: pandn
; CHECK: por ; CHECK: por
; SSE41-LABEL: test_select_v8i1:
; SSE41: pblendvb
} }
define <4 x i32> @test_select_v4i32(<4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2) { define <4 x i32> @test_select_v4i32(<4 x i1> %cond, <4 x i32> %arg1, <4 x i32> %arg2) {
...@@ -59,6 +79,10 @@ entry: ...@@ -59,6 +79,10 @@ entry:
; CHECK: pand ; CHECK: pand
; CHECK: pandn ; CHECK: pandn
; CHECK: por ; CHECK: por
; SSE41-LABEL: test_select_v4i32:
; SSE41: pslld xmm0, 31
; SSE41: blendvps
} }
define <4 x float> @test_select_v4f32(<4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2) { define <4 x float> @test_select_v4f32(<4 x i1> %cond, <4 x float> %arg1, <4 x float> %arg2) {
...@@ -69,6 +93,10 @@ entry: ...@@ -69,6 +93,10 @@ entry:
; CHECK: pand ; CHECK: pand
; CHECK: pandn ; CHECK: pandn
; CHECK: por ; CHECK: por
; SSE41-LABEL: test_select_v4f32:
; SSE41: pslld xmm0, 31
; SSE41: blendvps
} }
define <4 x i1> @test_select_v4i1(<4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2) { define <4 x i1> @test_select_v4i1(<4 x i1> %cond, <4 x i1> %arg1, <4 x i1> %arg2) {
...@@ -79,6 +107,10 @@ entry: ...@@ -79,6 +107,10 @@ entry:
; CHECK: pand ; CHECK: pand
; CHECK: pandn ; CHECK: pandn
; CHECK: por ; CHECK: por
; SSE41-LABEL: test_select_v4i1:
; SSE41: pslld xmm0, 31
; SSE41: blendvps
} }
; ERRORS-NOT: ICE translation error ; ERRORS-NOT: ICE translation error
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment