Commit c77f817f by Jim Stichnoth

Subzero: Fold the load instruction into the next cast instruction.

This is similar to the way a load instruction may be folded into the next arithmetic instruction. Usually the effect is to improve a sequence like:

    mov ax, WORD PTR [mem]
    movsx eax, ax

into this:

    movsx eax, WORD PTR [mem]

without actually improving register allocation, though other kinds of casts may have different improvements.

Existing tests needed to be fixed when they "inadvertently" did a cast to i32 return type and triggered the optimization when it wasn't wanted. These were fixed by inserting a "dummy" instruction between the load and the cast.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/1152783006
parent c207d51e
......@@ -883,7 +883,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
// that stack slot.
if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
assert(Var->getWeight().isZero());
if (!SpillVar->getLinkedTo()->hasReg()) {
if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
VariablesLinkedToSpillSlots.push_back(Var);
continue;
}
......@@ -1160,8 +1160,9 @@ void TargetX8632::split64(Variable *Var) {
}
Operand *TargetX8632::loOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64);
if (Operand->getType() != IceType_i64)
assert(Operand->getType() == IceType_i64 ||
Operand->getType() == IceType_f64);
if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
return Operand;
if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
split64(Var);
......@@ -1180,8 +1181,9 @@ Operand *TargetX8632::loOperand(Operand *Operand) {
}
Operand *TargetX8632::hiOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64);
if (Operand->getType() != IceType_i64)
assert(Operand->getType() == IceType_i64 ||
Operand->getType() == IceType_f64);
if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
return Operand;
if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
split64(Var);
......@@ -2463,20 +2465,25 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
// a_lo.i32 = t_lo.i32
// t_hi.i32 = hi(s.f64)
// a_hi.i32 = t_hi.i32
SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64);
SpillVar->setLinkedTo(llvm::dyn_cast<Variable>(Src0RM));
Variable *Spill = SpillVar;
Spill->setWeight(RegWeight::Zero);
_movq(Spill, Src0RM);
Operand *SpillLo, *SpillHi;
if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
SpillVariable *SpillVar =
Func->makeVariable<SpillVariable>(IceType_f64);
SpillVar->setLinkedTo(Src0Var);
Variable *Spill = SpillVar;
Spill->setWeight(RegWeight::Zero);
_movq(Spill, Src0RM);
SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
} else {
SpillLo = loOperand(Src0RM);
SpillHi = hiOperand(Src0RM);
}
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = makeReg(IceType_i32);
Variable *T_Hi = makeReg(IceType_i32);
VariableSplit *SpillLo =
VariableSplit::create(Func, Spill, VariableSplit::Low);
VariableSplit *SpillHi =
VariableSplit::create(Func, Spill, VariableSplit::High);
_mov(T_Lo, SpillLo);
_mov(DestLo, T_Lo);
......@@ -2486,6 +2493,12 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
case IceType_f64: {
Src0 = legalize(Src0);
assert(Src0->getType() == IceType_i64);
if (llvm::isa<OperandX8632Mem>(Src0)) {
Variable *T = Func->makeVariable(Dest->getType());
_movq(T, Src0);
_movq(Dest, T);
break;
}
// a.f64 = bitcast b.i64 ==>
// t_lo.i32 = b_lo.i32
// FakeDef(s.f64)
......@@ -3955,20 +3968,23 @@ void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,
} // anonymous namespace
void TargetX8632::lowerLoad(const InstLoad *Inst) {
void TargetX8632::lowerLoad(const InstLoad *Load) {
// A Load instruction can be treated the same as an Assign
// instruction, after the source operand is transformed into an
// OperandX8632Mem operand. Note that the address mode
// optimization already creates an OperandX8632Mem operand, so it
// doesn't need another level of transformation.
Type Ty = Inst->getDest()->getType();
Operand *Src0 = FormMemoryOperand(Inst->getSourceAddress(), Ty);
Type Ty = Load->getDest()->getType();
Operand *Src0 = FormMemoryOperand(Load->getSourceAddress(), Ty);
// Fuse this load with a subsequent Arithmetic instruction in the
// following situations:
// a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
// a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
//
// Fuse this load with a subsequent Cast instruction:
// a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
//
// TODO: Clean up and test thoroughly.
// (E.g., if there is an mfence-all make sure the load ends up on the
// same side of the fence).
......@@ -3979,30 +3995,46 @@ void TargetX8632::lowerLoad(const InstLoad *Inst) {
// load instruction's dest variable, and that instruction ends that
// variable's live range, then make the substitution. Deal with
// commutativity optimization in the arithmetic instruction lowering.
InstArithmetic *NewArith = nullptr;
if (InstArithmetic *Arith =
llvm::dyn_cast_or_null<InstArithmetic>(Context.getNextInst())) {
Variable *DestLoad = Inst->getDest();
Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
if (Src1Arith == DestLoad && Arith->isLastUse(Src1Arith) &&
DestLoad != Src0Arith) {
NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
Arith->getSrc(0), Src0);
} else if (Src0Arith == DestLoad && Arith->isCommutative() &&
Arith->isLastUse(Src0Arith) && DestLoad != Src1Arith) {
NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
Arith->getSrc(1), Src0);
}
if (NewArith) {
Arith->setDeleted();
Context.advanceNext();
lowerArithmetic(NewArith);
return;
//
// TODO(stichnot): Do load fusing as a separate pass. Run it before
// the bool folding pass. Modify Ice::Inst to allow src operands to
// be replaced, including updating Inst::LiveRangesEnded, to avoid
// having to manually mostly clone each instruction type.
Inst *NextInst = Context.getNextInst();
Variable *DestLoad = Load->getDest();
if (NextInst && NextInst->isLastUse(DestLoad)) {
if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
InstArithmetic *NewArith = nullptr;
Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
NewArith = InstArithmetic::create(
Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
} else if (Src0Arith == DestLoad && Arith->isCommutative() &&
DestLoad != Src1Arith) {
NewArith = InstArithmetic::create(
Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
}
if (NewArith) {
Arith->setDeleted();
Context.advanceNext();
lowerArithmetic(NewArith);
return;
}
} else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
if (Src0Cast == DestLoad) {
InstCast *NewCast =
InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
Cast->setDeleted();
Context.advanceNext();
lowerCast(NewCast);
return;
}
}
}
InstAssign *Assign = InstAssign::create(Func, Inst->getDest(), Src0);
InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
lowerAssign(Assign);
}
......
......@@ -335,7 +335,8 @@ define i32 @load_i8(i32 %addr_arg) {
entry:
%addr = inttoptr i32 %addr_arg to i8*
%ret = load i8* %addr, align 1
%ret_ext = zext i8 %ret to i32
%ret2 = sub i8 %ret, 0
%ret_ext = zext i8 %ret2 to i32
ret i32 %ret_ext
}
; CHECK-LABEL: load_i8
......@@ -345,7 +346,8 @@ define i32 @load_i8_global(i32 %addr_arg) {
entry:
%addr = bitcast [1 x i8]* @global8 to i8*
%ret = load i8* %addr, align 1
%ret_ext = zext i8 %ret to i32
%ret2 = sub i8 %ret, 0
%ret_ext = zext i8 %ret2 to i32
ret i32 %ret_ext
}
; CHECK-LABEL: load_i8_global
......
; Tests desired and undesired folding of load instructions into cast
; instructions. The folding is only done when liveness analysis is performed,
; so only O2 is tested.
; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
; Not testing trunc, or 32-bit bitcast, because the lowered code uses pretty
; much the same mov instructions regardless of whether folding is done.
; Fold case for zext: an i8 is loaded from address %arg + 200 (0xc8) and its
; only use is the zext on the very next instruction, so the load is a
; candidate for being folded into the cast.
define internal i32 @zext_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i8*
%load = load i8* %addr, align 1
%result = zext i8 %load to i32
ret i32 %result
}
; CHECK-LABEL: zext_fold
; CHECK: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
; No-fold case for zext: the loaded i8 is consumed by two zext instructions,
; so the cast that directly follows the load is not the last use of %load.
define internal i32 @zext_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i8*
%load = load i8* %addr, align 1
%tmp1 = zext i8 %load to i32
%tmp2 = zext i8 %load to i32
%result = add i32 %tmp1, %tmp2
ret i32 %result
}
; Test that load folding does not happen.
; CHECK-LABEL: zext_nofold
; CHECK-NOT: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
; Fold case for sext: an i8 is loaded from address %arg + 200 (0xc8) and its
; only use is the sext on the very next instruction.
define internal i32 @sext_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i8*
%load = load i8* %addr, align 1
%result = sext i8 %load to i32
ret i32 %result
}
; CHECK-LABEL: sext_fold
; CHECK: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
; No-fold case for sext: the loaded i8 feeds two sext instructions, so the
; cast that directly follows the load is not the last use of %load.
define internal i32 @sext_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i8*
%load = load i8* %addr, align 1
%tmp1 = sext i8 %load to i32
%tmp2 = sext i8 %load to i32
%result = add i32 %tmp1, %tmp2
ret i32 %result
}
; Test that load folding does not happen.
; CHECK-LABEL: sext_nofold
; CHECK-NOT: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
; Fold case for fptrunc: a double is loaded from address %arg + 200 (0xc8)
; and its only use is the fptrunc on the very next instruction.
define internal float @fptrunc_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%result = fptrunc double %load to float
ret float %result
}
; CHECK-LABEL: fptrunc_fold
; CHECK: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
; No-fold case for fptrunc: the loaded double feeds two fptrunc instructions,
; so the cast that directly follows the load is not the last use of %load.
define internal float @fptrunc_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%tmp1 = fptrunc double %load to float
%tmp2 = fptrunc double %load to float
%result = fadd float %tmp1, %tmp2
ret float %result
}
; Test that load folding does not happen.
; CHECK-LABEL: fptrunc_nofold
; CHECK-NOT: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
; Fold case for fpext: a float is loaded from address %arg + 200 (0xc8) and
; its only use is the fpext on the very next instruction.
define internal double @fpext_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to float*
%load = load float* %addr, align 4
%result = fpext float %load to double
ret double %result
}
; CHECK-LABEL: fpext_fold
; CHECK: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
; No-fold case for fpext: the loaded float feeds two fpext instructions, so
; the cast that directly follows the load is not the last use of %load.
define internal double @fpext_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to float*
%load = load float* %addr, align 4
%tmp1 = fpext float %load to double
%tmp2 = fpext float %load to double
%result = fadd double %tmp1, %tmp2
ret double %result
}
; Test that load folding does not happen.
; CHECK-LABEL: fpext_nofold
; CHECK-NOT: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
; Fold case for fptoui: a double is loaded from address %arg + 200 (0xc8) and
; its only use is the fptoui on the very next instruction. The i16 result is
; widened by a separate zext so that the function can return i32.
define internal i32 @fptoui_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%result = fptoui double %load to i16
%result2 = zext i16 %result to i32
ret i32 %result2
}
; CHECK-LABEL: fptoui_fold
; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
; No-fold case for fptoui: the loaded double feeds two fptoui instructions,
; so the cast that directly follows the load is not the last use of %load.
; The i16 sum is widened by a zext so that the function can return i32.
define internal i32 @fptoui_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%tmp1 = fptoui double %load to i16
%tmp2 = fptoui double %load to i16
%result = add i16 %tmp1, %tmp2
%result2 = zext i16 %result to i32
ret i32 %result2
}
; Test that load folding does not happen.
; CHECK-LABEL: fptoui_nofold
; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
; Fold case for fptosi: a double is loaded from address %arg + 200 (0xc8) and
; its only use is the fptosi on the very next instruction. The i16 result is
; widened by a separate zext so that the function can return i32.
define internal i32 @fptosi_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%result = fptosi double %load to i16
%result2 = zext i16 %result to i32
ret i32 %result2
}
; CHECK-LABEL: fptosi_fold
; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
; No-fold case for fptosi: the loaded double feeds two fptosi instructions,
; so the cast that directly follows the load is not the last use of %load.
; The i16 sum is widened by a zext so that the function can return i32.
define internal i32 @fptosi_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%tmp1 = fptosi double %load to i16
%tmp2 = fptosi double %load to i16
%result = add i16 %tmp1, %tmp2
%result2 = zext i16 %result to i32
ret i32 %result2
}
; Test that load folding does not happen.
; CHECK-LABEL: fptosi_nofold
; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
; Fold case for uitofp: an i16 is loaded from address %arg + 200 (0xc8) and
; its only use is the uitofp on the very next instruction.
define internal double @uitofp_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i16*
%load = load i16* %addr, align 1
%result = uitofp i16 %load to double
ret double %result
}
; CHECK-LABEL: uitofp_fold
; CHECK: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
; No-fold case for uitofp: the loaded i16 feeds two uitofp instructions, so
; the cast that directly follows the load is not the last use of %load.
define internal double @uitofp_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i16*
%load = load i16* %addr, align 1
%tmp1 = uitofp i16 %load to double
%tmp2 = uitofp i16 %load to double
%result = fadd double %tmp1, %tmp2
ret double %result
}
; Test that load folding does not happen.
; CHECK-LABEL: uitofp_nofold
; CHECK-NOT: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
; Fold case for sitofp: an i16 is loaded from address %arg + 200 (0xc8) and
; its only use is the sitofp on the very next instruction.
define internal double @sitofp_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i16*
%load = load i16* %addr, align 1
%result = sitofp i16 %load to double
ret double %result
}
; CHECK-LABEL: sitofp_fold
; CHECK: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
; No-fold case for sitofp: the loaded i16 feeds two sitofp instructions, so
; the cast that directly follows the load is not the last use of %load.
define internal double @sitofp_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i16*
%load = load i16* %addr, align 1
%tmp1 = sitofp i16 %load to double
%tmp2 = sitofp i16 %load to double
%result = fadd double %tmp1, %tmp2
ret double %result
}
; Test that load folding does not happen.
; CHECK-LABEL: sitofp_nofold
; CHECK-NOT: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
; Fold case for an i64-to-double bitcast: an i64 is loaded from address
; %arg + 200 (0xc8) and its only use is the bitcast on the very next
; instruction.
define internal double @bitcast_i64_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i64*
%load = load i64* %addr, align 1
%result = bitcast i64 %load to double
ret double %result
}
; CHECK-LABEL: bitcast_i64_fold
; CHECK: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
; No-fold case for an i64-to-double bitcast: the loaded i64 feeds two
; bitcasts, so the cast that directly follows the load is not the last use
; of %load.
define internal double @bitcast_i64_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i64*
%load = load i64* %addr, align 1
%tmp1 = bitcast i64 %load to double
%tmp2 = bitcast i64 %load to double
%result = fadd double %tmp1, %tmp2
ret double %result
}
; Test that load folding does not happen.
; CHECK-LABEL: bitcast_i64_nofold
; CHECK-NOT: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
; Fold case for a double-to-i64 bitcast: a double is loaded from address
; %arg + 200 (0xc8) and its only use is the bitcast on the very next
; instruction.
define internal i64 @bitcast_double_fold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%result = bitcast double %load to i64
ret i64 %result
}
; CHECK-LABEL: bitcast_double_fold
; CHECK-NOT: QWORD PTR
; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xc8]
; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xcc]
; CHECK-NOT: QWORD PTR
; No-fold case for a double-to-i64 bitcast: the loaded double feeds two
; bitcasts, so the cast that directly follows the load is not the last use
; of %load.
define internal i64 @bitcast_double_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%tmp1 = bitcast double %load to i64
%tmp2 = bitcast double %load to i64
%result = add i64 %tmp1, %tmp2
ret i64 %result
}
; Test that load folding does not happen.
; CHECK-LABEL: bitcast_double_nofold
; CHECK: QWORD PTR
; CHECK: QWORD PTR
......@@ -48,7 +48,8 @@ entry:
%ptr = inttoptr i32 %iptr to i8*
; parameter value "6" is for the sequential consistency memory order.
%i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6)
%r = zext i8 %i to i32
%i2 = sub i8 %i, 0
%r = zext i8 %i2 to i32
ret i32 %r
}
; CHECK-LABEL: test_atomic_load_8
......@@ -59,7 +60,8 @@ define i32 @test_atomic_load_16(i32 %iptr) {
entry:
%ptr = inttoptr i32 %iptr to i16*
%i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6)
%r = zext i16 %i to i32
%i2 = sub i16 %i, 0
%r = zext i16 %i2 to i32
ret i32 %r
}
; CHECK-LABEL: test_atomic_load_16
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment