Commit f79d2cb6 by Jim Stichnoth

Subzero: Don't use key SSE instructions on potentially unaligned loads.

The non-mov-like SSE instructions generally require 16-byte aligned memory operands. The PNaCl bitcode ABI only guarantees 4-byte alignment or less on vector loads and stores. Subzero maintains stack alignment, so stack memory operands are fine. We handle this by legalizing memory operands into a register wherever there is doubt. This bug was first discovered on the vector_align scons test.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4083
BUG= https://code.google.com/p/nativeclient/issues/detail?id=4133
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/1024253003
parent 69d3f9c6
...@@ -273,6 +273,29 @@ protected: ...@@ -273,6 +273,29 @@ protected:
static bool isClassof(const Inst *Inst, InstKindX8632 MyKind) { static bool isClassof(const Inst *Inst, InstKindX8632 MyKind) {
return Inst->getKind() == static_cast<InstKind>(MyKind); return Inst->getKind() == static_cast<InstKind>(MyKind);
} }
// Most instructions that operate on vector arguments require vector
// memory operands to be fully aligned (16-byte alignment for PNaCl
// vector types). The stack frame layout and call ABI ensure proper
// alignment for stack operands, but memory operands (originating
// from load/store bitcode instructions) only have element-size
// alignment guarantees. This function validates that none of the
// operands is a memory operand of vector type, calling
// report_fatal_error() if one is found. This function should be
// called during emission, and maybe also in the ctor (as long as
// that fits the lowering style).
void validateVectorAddrMode() const {
if (getDest())
validateVectorAddrModeOpnd(getDest());
for (SizeT i = 0; i < getSrcSize(); ++i) {
validateVectorAddrModeOpnd(getSrc(i));
}
}
private:
static void validateVectorAddrModeOpnd(const Operand *Opnd) {
if (llvm::isa<OperandX8632Mem>(Opnd) && isVectorType(Opnd->getType())) {
llvm::report_fatal_error("Possible misaligned vector memory operation");
}
}
}; };
// InstX8632Label represents an intra-block label that is the target // InstX8632Label represents an intra-block label that is the target
...@@ -752,10 +775,12 @@ public: ...@@ -752,10 +775,12 @@ public:
void emit(const Cfg *Func) const override { void emit(const Cfg *Func) const override {
if (!ALLOW_DUMP) if (!ALLOW_DUMP)
return; return;
validateVectorAddrMode();
const bool ShiftHack = false; const bool ShiftHack = false;
emitTwoAddress(Opcode, this, Func, ShiftHack); emitTwoAddress(Opcode, this, Func, ShiftHack);
} }
void emitIAS(const Cfg *Func) const override { void emitIAS(const Cfg *Func) const override {
validateVectorAddrMode();
Type Ty = getDest()->getType(); Type Ty = getDest()->getType();
if (NeedsElementType) if (NeedsElementType)
Ty = typeElementType(Ty); Ty = typeElementType(Ty);
...@@ -803,10 +828,12 @@ public: ...@@ -803,10 +828,12 @@ public:
void emit(const Cfg *Func) const override { void emit(const Cfg *Func) const override {
if (!ALLOW_DUMP) if (!ALLOW_DUMP)
return; return;
validateVectorAddrMode();
const bool ShiftHack = false; const bool ShiftHack = false;
emitTwoAddress(Opcode, this, Func, ShiftHack); emitTwoAddress(Opcode, this, Func, ShiftHack);
} }
void emitIAS(const Cfg *Func) const override { void emitIAS(const Cfg *Func) const override {
validateVectorAddrMode();
Type Ty = getDest()->getType(); Type Ty = getDest()->getType();
assert(AllowAllTypes || isVectorType(Ty)); assert(AllowAllTypes || isVectorType(Ty));
Type ElementTy = typeElementType(Ty); Type ElementTy = typeElementType(Ty);
......
...@@ -1375,6 +1375,8 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1375,6 +1375,8 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
} else if (isVectorType(Dest->getType())) { } else if (isVectorType(Dest->getType())) {
// TODO: Trap on integer divide and integer modulo by zero. // TODO: Trap on integer divide and integer modulo by zero.
// See: https://code.google.com/p/nativeclient/issues/detail?id=3899 // See: https://code.google.com/p/nativeclient/issues/detail?id=3899
if (llvm::isa<OperandX8632Mem>(Src1))
Src1 = legalizeToVar(Src1);
switch (Inst->getOp()) { switch (Inst->getOp()) {
case InstArithmetic::_num: case InstArithmetic::_num:
llvm_unreachable("Unknown arithmetic operator"); llvm_unreachable("Unknown arithmetic operator");
...@@ -2090,6 +2092,8 @@ void TargetX8632::lowerCast(const InstCast *Inst) { ...@@ -2090,6 +2092,8 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
assert(Dest->getType() == IceType_v4i32 && assert(Dest->getType() == IceType_v4i32 &&
Inst->getSrc(0)->getType() == IceType_v4f32); Inst->getSrc(0)->getType() == IceType_v4f32);
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
if (llvm::isa<OperandX8632Mem>(Src0RM))
Src0RM = legalizeToVar(Src0RM);
Variable *T = makeReg(Dest->getType()); Variable *T = makeReg(Dest->getType());
_cvt(T, Src0RM, InstX8632Cvt::Tps2dq); _cvt(T, Src0RM, InstX8632Cvt::Tps2dq);
_movp(Dest, T); _movp(Dest, T);
...@@ -2165,6 +2169,8 @@ void TargetX8632::lowerCast(const InstCast *Inst) { ...@@ -2165,6 +2169,8 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
assert(Dest->getType() == IceType_v4f32 && assert(Dest->getType() == IceType_v4f32 &&
Inst->getSrc(0)->getType() == IceType_v4i32); Inst->getSrc(0)->getType() == IceType_v4i32);
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
if (llvm::isa<OperandX8632Mem>(Src0RM))
Src0RM = legalizeToVar(Src0RM);
Variable *T = makeReg(Dest->getType()); Variable *T = makeReg(Dest->getType());
_cvt(T, Src0RM, InstX8632Cvt::Dq2ps); _cvt(T, Src0RM, InstX8632Cvt::Dq2ps);
_movp(Dest, T); _movp(Dest, T);
...@@ -2472,6 +2478,8 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) { ...@@ -2472,6 +2478,8 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
} else { } else {
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
switch (Condition) { switch (Condition) {
default: { default: {
...@@ -2609,10 +2617,14 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { ...@@ -2609,10 +2617,14 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
llvm_unreachable("unexpected condition"); llvm_unreachable("unexpected condition");
break; break;
case InstIcmp::Eq: { case InstIcmp::Eq: {
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM); _movp(T, Src0RM);
_pcmpeq(T, Src1RM); _pcmpeq(T, Src1RM);
} break; } break;
case InstIcmp::Ne: { case InstIcmp::Ne: {
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM); _movp(T, Src0RM);
_pcmpeq(T, Src1RM); _pcmpeq(T, Src1RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty); Variable *MinusOne = makeVectorOfMinusOnes(Ty);
...@@ -2620,12 +2632,16 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { ...@@ -2620,12 +2632,16 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
} break; } break;
case InstIcmp::Ugt: case InstIcmp::Ugt:
case InstIcmp::Sgt: { case InstIcmp::Sgt: {
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM); _movp(T, Src0RM);
_pcmpgt(T, Src1RM); _pcmpgt(T, Src1RM);
} break; } break;
case InstIcmp::Uge: case InstIcmp::Uge:
case InstIcmp::Sge: { case InstIcmp::Sge: {
// !(Src1RM > Src0RM) // !(Src1RM > Src0RM)
if (llvm::isa<OperandX8632Mem>(Src0RM))
Src0RM = legalizeToVar(Src0RM);
_movp(T, Src1RM); _movp(T, Src1RM);
_pcmpgt(T, Src0RM); _pcmpgt(T, Src0RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty); Variable *MinusOne = makeVectorOfMinusOnes(Ty);
...@@ -2633,12 +2649,16 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { ...@@ -2633,12 +2649,16 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
} break; } break;
case InstIcmp::Ult: case InstIcmp::Ult:
case InstIcmp::Slt: { case InstIcmp::Slt: {
if (llvm::isa<OperandX8632Mem>(Src0RM))
Src0RM = legalizeToVar(Src0RM);
_movp(T, Src1RM); _movp(T, Src1RM);
_pcmpgt(T, Src0RM); _pcmpgt(T, Src0RM);
} break; } break;
case InstIcmp::Ule: case InstIcmp::Ule:
case InstIcmp::Sle: { case InstIcmp::Sle: {
// !(Src0RM > Src1RM) // !(Src0RM > Src1RM)
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM); _movp(T, Src0RM);
_pcmpgt(T, Src1RM); _pcmpgt(T, Src1RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty); Variable *MinusOne = makeVectorOfMinusOnes(Ty);
...@@ -3092,8 +3112,12 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { ...@@ -3092,8 +3112,12 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
Variable *T = makeVectorOfFabsMask(Ty); Variable *T = makeVectorOfFabsMask(Ty);
// The pand instruction operates on an m128 memory operand, so if // The pand instruction operates on an m128 memory operand, so if
// Src is an f32 or f64, we need to make sure it's in a register. // Src is an f32 or f64, we need to make sure it's in a register.
if (!isVectorType(Ty)) if (isVectorType(Ty)) {
if (llvm::isa<OperandX8632Mem>(Src))
Src = legalizeToVar(Src);
} else {
Src = legalizeToVar(Src); Src = legalizeToVar(Src);
}
_pand(T, Src); _pand(T, Src);
if (isVectorType(Ty)) if (isVectorType(Ty))
_movp(Dest, T); _movp(Dest, T);
......
...@@ -56,8 +56,9 @@ entry: ...@@ -56,8 +56,9 @@ entry:
%arg1 = load <8 x i16>* %addr_ptr, align 2 %arg1 = load <8 x i16>* %addr_ptr, align 2
%res_vec = mul <8 x i16> %arg0, %arg1 %res_vec = mul <8 x i16> %arg0, %arg1
ret <8 x i16> %res_vec ret <8 x i16> %res_vec
; Address mode optimization is generally unsafe for SSE vector instructions.
; CHECK-LABEL: load_mul_v8i16_mem ; CHECK-LABEL: load_mul_v8i16_mem
; CHECK: pmullw xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40] ; CHECK-NOT: pmullw xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
} }
define <4 x i32> @load_mul_v4i32_mem(<4 x i32> %arg0, i32 %arg1_iptr) { define <4 x i32> @load_mul_v4i32_mem(<4 x i32> %arg0, i32 %arg1_iptr) {
...@@ -67,12 +68,13 @@ entry: ...@@ -67,12 +68,13 @@ entry:
%arg1 = load <4 x i32>* %addr_ptr, align 4 %arg1 = load <4 x i32>* %addr_ptr, align 4
%res = mul <4 x i32> %arg0, %arg1 %res = mul <4 x i32> %arg0, %arg1
ret <4 x i32> %res ret <4 x i32> %res
; Address mode optimization is generally unsafe for SSE vector instructions.
; CHECK-LABEL: load_mul_v4i32_mem ; CHECK-LABEL: load_mul_v4i32_mem
; CHECK: pmuludq xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40] ; CHECK-NOT: pmuludq xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
; CHECK: pmuludq ; CHECK: pmuludq
; ;
; SSE41-LABEL: load_mul_v4i32_mem ; SSE41-LABEL: load_mul_v4i32_mem
; SSE41: pmulld xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40] ; SSE41-NOT: pmulld xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
} }
define float @address_mode_opt_chaining(float* %arg) { define float @address_mode_opt_chaining(float* %arg) {
......
; This test checks that when SSE instructions access memory and require full
; alignment, memory operands are limited to properly aligned stack operands.
; This would only happen when we fuse a load instruction with another
; instruction, which currently only happens with non-scalarized Arithmetic
; instructions.
; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
; RUN: %p2i -i %s --filetype=obj --disassemble --args -Om1 | FileCheck %s
define <4 x i32> @test_add(i32 %addr_i, <4 x i32> %addend) {
entry:
%addr = inttoptr i32 %addr_i to <4 x i32>*
%loaded = load <4 x i32>* %addr, align 4
%result = add <4 x i32> %addend, %loaded
ret <4 x i32> %result
}
; CHECK-LABEL: test_add
; CHECK-NOT: paddd xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
; CHECK: paddd xmm{{.}},
define <4 x i32> @test_and(i32 %addr_i, <4 x i32> %addend) {
entry:
%addr = inttoptr i32 %addr_i to <4 x i32>*
%loaded = load <4 x i32>* %addr, align 4
%result = and <4 x i32> %addend, %loaded
ret <4 x i32> %result
}
; CHECK-LABEL: test_and
; CHECK-NOT: pand xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
; CHECK: pand xmm{{.}},
define <4 x i32> @test_or(i32 %addr_i, <4 x i32> %addend) {
entry:
%addr = inttoptr i32 %addr_i to <4 x i32>*
%loaded = load <4 x i32>* %addr, align 4
%result = or <4 x i32> %addend, %loaded
ret <4 x i32> %result
}
; CHECK-LABEL: test_or
; CHECK-NOT: por xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
; CHECK: por xmm{{.}},
define <4 x i32> @test_xor(i32 %addr_i, <4 x i32> %addend) {
entry:
%addr = inttoptr i32 %addr_i to <4 x i32>*
%loaded = load <4 x i32>* %addr, align 4
%result = xor <4 x i32> %addend, %loaded
ret <4 x i32> %result
}
; CHECK-LABEL: test_xor
; CHECK-NOT: pxor xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
; CHECK: pxor xmm{{.}},
define <4 x i32> @test_sub(i32 %addr_i, <4 x i32> %addend) {
entry:
%addr = inttoptr i32 %addr_i to <4 x i32>*
%loaded = load <4 x i32>* %addr, align 4
%result = sub <4 x i32> %addend, %loaded
ret <4 x i32> %result
}
; CHECK-LABEL: test_sub
; CHECK-NOT: psubd xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
; CHECK: psubd xmm{{.}},
define <4 x float> @test_fadd(i32 %addr_i, <4 x float> %addend) {
entry:
%addr = inttoptr i32 %addr_i to <4 x float>*
%loaded = load <4 x float>* %addr, align 4
%result = fadd <4 x float> %addend, %loaded
ret <4 x float> %result
}
; CHECK-LABEL: test_fadd
; CHECK-NOT: addps xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
; CHECK: addps xmm{{.}},
define <4 x float> @test_fsub(i32 %addr_i, <4 x float> %addend) {
entry:
%addr = inttoptr i32 %addr_i to <4 x float>*
%loaded = load <4 x float>* %addr, align 4
%result = fsub <4 x float> %addend, %loaded
ret <4 x float> %result
}
; CHECK-LABEL: test_fsub
; CHECK-NOT: subps xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
; CHECK: subps xmm{{.}},
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment