Commit c77f817f by Jim Stichnoth

Subzero: Fold the load instruction into the next cast instruction.

This is similar to the way a load instruction may be folded into the next arithmetic instruction. Usually the effect is to improve a sequence like:
    mov ax, WORD PTR [mem]
    movsx eax, ax
into this:
    movsx eax, WORD PTR [mem]
without actually improving register allocation, though other kinds of casts may have different improvements.

Existing tests needed to be fixed when they "inadvertently" did a cast to i32 return type and triggered the optimization when it wasn't wanted. These were fixed by inserting a "dummy" instruction between the load and the cast.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/1152783006
parent c207d51e
...@@ -883,7 +883,7 @@ void TargetX8632::addProlog(CfgNode *Node) { ...@@ -883,7 +883,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
// that stack slot. // that stack slot.
if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) { if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
assert(Var->getWeight().isZero()); assert(Var->getWeight().isZero());
if (!SpillVar->getLinkedTo()->hasReg()) { if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
VariablesLinkedToSpillSlots.push_back(Var); VariablesLinkedToSpillSlots.push_back(Var);
continue; continue;
} }
...@@ -1160,8 +1160,9 @@ void TargetX8632::split64(Variable *Var) { ...@@ -1160,8 +1160,9 @@ void TargetX8632::split64(Variable *Var) {
} }
Operand *TargetX8632::loOperand(Operand *Operand) { Operand *TargetX8632::loOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64); assert(Operand->getType() == IceType_i64 ||
if (Operand->getType() != IceType_i64) Operand->getType() == IceType_f64);
if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
return Operand; return Operand;
if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) { if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
split64(Var); split64(Var);
...@@ -1180,8 +1181,9 @@ Operand *TargetX8632::loOperand(Operand *Operand) { ...@@ -1180,8 +1181,9 @@ Operand *TargetX8632::loOperand(Operand *Operand) {
} }
Operand *TargetX8632::hiOperand(Operand *Operand) { Operand *TargetX8632::hiOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64); assert(Operand->getType() == IceType_i64 ||
if (Operand->getType() != IceType_i64) Operand->getType() == IceType_f64);
if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
return Operand; return Operand;
if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) { if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
split64(Var); split64(Var);
...@@ -2463,20 +2465,25 @@ void TargetX8632::lowerCast(const InstCast *Inst) { ...@@ -2463,20 +2465,25 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
// a_lo.i32 = t_lo.i32 // a_lo.i32 = t_lo.i32
// t_hi.i32 = hi(s.f64) // t_hi.i32 = hi(s.f64)
// a_hi.i32 = t_hi.i32 // a_hi.i32 = t_hi.i32
SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64); Operand *SpillLo, *SpillHi;
SpillVar->setLinkedTo(llvm::dyn_cast<Variable>(Src0RM)); if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
Variable *Spill = SpillVar; SpillVariable *SpillVar =
Spill->setWeight(RegWeight::Zero); Func->makeVariable<SpillVariable>(IceType_f64);
_movq(Spill, Src0RM); SpillVar->setLinkedTo(Src0Var);
Variable *Spill = SpillVar;
Spill->setWeight(RegWeight::Zero);
_movq(Spill, Src0RM);
SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
} else {
SpillLo = loOperand(Src0RM);
SpillHi = hiOperand(Src0RM);
}
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = makeReg(IceType_i32); Variable *T_Lo = makeReg(IceType_i32);
Variable *T_Hi = makeReg(IceType_i32); Variable *T_Hi = makeReg(IceType_i32);
VariableSplit *SpillLo =
VariableSplit::create(Func, Spill, VariableSplit::Low);
VariableSplit *SpillHi =
VariableSplit::create(Func, Spill, VariableSplit::High);
_mov(T_Lo, SpillLo); _mov(T_Lo, SpillLo);
_mov(DestLo, T_Lo); _mov(DestLo, T_Lo);
...@@ -2486,6 +2493,12 @@ void TargetX8632::lowerCast(const InstCast *Inst) { ...@@ -2486,6 +2493,12 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
case IceType_f64: { case IceType_f64: {
Src0 = legalize(Src0); Src0 = legalize(Src0);
assert(Src0->getType() == IceType_i64); assert(Src0->getType() == IceType_i64);
if (llvm::isa<OperandX8632Mem>(Src0)) {
Variable *T = Func->makeVariable(Dest->getType());
_movq(T, Src0);
_movq(Dest, T);
break;
}
// a.f64 = bitcast b.i64 ==> // a.f64 = bitcast b.i64 ==>
// t_lo.i32 = b_lo.i32 // t_lo.i32 = b_lo.i32
// FakeDef(s.f64) // FakeDef(s.f64)
...@@ -3955,20 +3968,23 @@ void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base, ...@@ -3955,20 +3968,23 @@ void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,
} // anonymous namespace } // anonymous namespace
void TargetX8632::lowerLoad(const InstLoad *Inst) { void TargetX8632::lowerLoad(const InstLoad *Load) {
// A Load instruction can be treated the same as an Assign // A Load instruction can be treated the same as an Assign
// instruction, after the source operand is transformed into an // instruction, after the source operand is transformed into an
// OperandX8632Mem operand. Note that the address mode // OperandX8632Mem operand. Note that the address mode
// optimization already creates an OperandX8632Mem operand, so it // optimization already creates an OperandX8632Mem operand, so it
// doesn't need another level of transformation. // doesn't need another level of transformation.
Type Ty = Inst->getDest()->getType(); Type Ty = Load->getDest()->getType();
Operand *Src0 = FormMemoryOperand(Inst->getSourceAddress(), Ty); Operand *Src0 = FormMemoryOperand(Load->getSourceAddress(), Ty);
// Fuse this load with a subsequent Arithmetic instruction in the // Fuse this load with a subsequent Arithmetic instruction in the
// following situations: // following situations:
// a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b // a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
// a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true // a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
// //
// Fuse this load with a subsequent Cast instruction:
// a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
//
// TODO: Clean up and test thoroughly. // TODO: Clean up and test thoroughly.
// (E.g., if there is an mfence-all make sure the load ends up on the // (E.g., if there is an mfence-all make sure the load ends up on the
// same side of the fence). // same side of the fence).
...@@ -3979,30 +3995,46 @@ void TargetX8632::lowerLoad(const InstLoad *Inst) { ...@@ -3979,30 +3995,46 @@ void TargetX8632::lowerLoad(const InstLoad *Inst) {
// load instruction's dest variable, and that instruction ends that // load instruction's dest variable, and that instruction ends that
// variable's live range, then make the substitution. Deal with // variable's live range, then make the substitution. Deal with
// commutativity optimization in the arithmetic instruction lowering. // commutativity optimization in the arithmetic instruction lowering.
InstArithmetic *NewArith = nullptr; //
if (InstArithmetic *Arith = // TODO(stichnot): Do load fusing as a separate pass. Run it before
llvm::dyn_cast_or_null<InstArithmetic>(Context.getNextInst())) { // the bool folding pass. Modify Ice::Inst to allow src operands to
Variable *DestLoad = Inst->getDest(); // be replaced, including updating Inst::LiveRangesEnded, to avoid
Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0)); // having to manually mostly clone each instruction type.
Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1)); Inst *NextInst = Context.getNextInst();
if (Src1Arith == DestLoad && Arith->isLastUse(Src1Arith) && Variable *DestLoad = Load->getDest();
DestLoad != Src0Arith) { if (NextInst && NextInst->isLastUse(DestLoad)) {
NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(), if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
Arith->getSrc(0), Src0); InstArithmetic *NewArith = nullptr;
} else if (Src0Arith == DestLoad && Arith->isCommutative() && Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
Arith->isLastUse(Src0Arith) && DestLoad != Src1Arith) { Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(), if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
Arith->getSrc(1), Src0); NewArith = InstArithmetic::create(
} Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
if (NewArith) { } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
Arith->setDeleted(); DestLoad != Src1Arith) {
Context.advanceNext(); NewArith = InstArithmetic::create(
lowerArithmetic(NewArith); Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
return; }
if (NewArith) {
Arith->setDeleted();
Context.advanceNext();
lowerArithmetic(NewArith);
return;
}
} else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
if (Src0Cast == DestLoad) {
InstCast *NewCast =
InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
Cast->setDeleted();
Context.advanceNext();
lowerCast(NewCast);
return;
}
} }
} }
InstAssign *Assign = InstAssign::create(Func, Inst->getDest(), Src0); InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
lowerAssign(Assign); lowerAssign(Assign);
} }
......
...@@ -335,7 +335,8 @@ define i32 @load_i8(i32 %addr_arg) { ...@@ -335,7 +335,8 @@ define i32 @load_i8(i32 %addr_arg) {
entry: entry:
%addr = inttoptr i32 %addr_arg to i8* %addr = inttoptr i32 %addr_arg to i8*
%ret = load i8* %addr, align 1 %ret = load i8* %addr, align 1
%ret_ext = zext i8 %ret to i32 %ret2 = sub i8 %ret, 0
%ret_ext = zext i8 %ret2 to i32
ret i32 %ret_ext ret i32 %ret_ext
} }
; CHECK-LABEL: load_i8 ; CHECK-LABEL: load_i8
...@@ -345,7 +346,8 @@ define i32 @load_i8_global(i32 %addr_arg) { ...@@ -345,7 +346,8 @@ define i32 @load_i8_global(i32 %addr_arg) {
entry: entry:
%addr = bitcast [1 x i8]* @global8 to i8* %addr = bitcast [1 x i8]* @global8 to i8*
%ret = load i8* %addr, align 1 %ret = load i8* %addr, align 1
%ret_ext = zext i8 %ret to i32 %ret2 = sub i8 %ret, 0
%ret_ext = zext i8 %ret2 to i32
ret i32 %ret_ext ret i32 %ret_ext
} }
; CHECK-LABEL: load_i8_global ; CHECK-LABEL: load_i8_global
......
; Tests desired and undesired folding of load instructions into cast
; instructions. The folding is only done when liveness analysis is performed,
; so only O2 is tested.
; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
; Not testing trunc, or 32-bit bitcast, because the lowered code uses pretty
; much the same mov instructions regardless of whether folding is done.
; Folding case: %load has a single use, the zext that immediately follows the
; load, so the expected output is one movzx with a BYTE PTR memory operand.
define internal i32 @zext_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i8*
%load = load i8* %addr, align 1
%result = zext i8 %load to i32
ret i32 %result
}
; CHECK-LABEL: zext_fold
; CHECK: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two zext instructions, so the cast right after
; the load is not the last use of %load and the load is expected to remain a
; separate instruction (no movzx with the memory operand).
define internal i32 @zext_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i8*
%load = load i8* %addr, align 1
%tmp1 = zext i8 %load to i32
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = zext i8 %load to i32
%result = add i32 %tmp1, %tmp2
ret i32 %result
}
; Test that load folding does not happen.
; CHECK-LABEL: zext_nofold
; CHECK-NOT: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
; Folding case: %load has a single use, the sext that immediately follows the
; load, so the expected output is one movsx with a BYTE PTR memory operand.
define internal i32 @sext_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i8*
%load = load i8* %addr, align 1
%result = sext i8 %load to i32
ret i32 %result
}
; CHECK-LABEL: sext_fold
; CHECK: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two sext instructions, so the cast right after
; the load is not the last use of %load and the load is expected to remain a
; separate instruction (no movsx with the memory operand).
define internal i32 @sext_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i8*
%load = load i8* %addr, align 1
%tmp1 = sext i8 %load to i32
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = sext i8 %load to i32
%result = add i32 %tmp1, %tmp2
ret i32 %result
}
; Test that load folding does not happen.
; CHECK-LABEL: sext_nofold
; CHECK-NOT: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
; Folding case: the loaded double is used only by the fptrunc that immediately
; follows, so the expected output is cvtsd2ss with a QWORD PTR memory operand.
define internal float @fptrunc_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%result = fptrunc double %load to float
ret float %result
}
; CHECK-LABEL: fptrunc_fold
; CHECK: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two fptrunc instructions, so the cast right
; after the load is not the last use of %load and no cvtsd2ss with the memory
; operand is expected.
define internal float @fptrunc_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%tmp1 = fptrunc double %load to float
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = fptrunc double %load to float
%result = fadd float %tmp1, %tmp2
ret float %result
}
; Test that load folding does not happen.
; CHECK-LABEL: fptrunc_nofold
; CHECK-NOT: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
; Folding case: the loaded float is used only by the fpext that immediately
; follows, so the expected output is cvtss2sd with a DWORD PTR memory operand.
define internal double @fpext_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to float*
%load = load float* %addr, align 4
%result = fpext float %load to double
ret double %result
}
; CHECK-LABEL: fpext_fold
; CHECK: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two fpext instructions, so the cast right
; after the load is not the last use of %load and no cvtss2sd with the memory
; operand is expected.
define internal double @fpext_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to float*
%load = load float* %addr, align 4
%tmp1 = fpext float %load to double
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = fpext float %load to double
%result = fadd double %tmp1, %tmp2
ret double %result
}
; Test that load folding does not happen.
; CHECK-LABEL: fpext_nofold
; CHECK-NOT: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
; Folding case: the loaded double is used only by the fptoui that immediately
; follows, so the expected output is cvttsd2si with a QWORD PTR memory operand.
define internal i32 @fptoui_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%result = fptoui double %load to i16
; The i16 conversion result is widened to i32 before being returned.
%result2 = zext i16 %result to i32
ret i32 %result2
}
; CHECK-LABEL: fptoui_fold
; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two fptoui instructions, so the cast right
; after the load is not the last use of %load and no cvttsd2si with the memory
; operand is expected.
define internal i32 @fptoui_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%tmp1 = fptoui double %load to i16
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = fptoui double %load to i16
%result = add i16 %tmp1, %tmp2
; The i16 sum is widened to i32 before being returned.
%result2 = zext i16 %result to i32
ret i32 %result2
}
; Test that load folding does not happen.
; CHECK-LABEL: fptoui_nofold
; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
; Folding case: the loaded double is used only by the fptosi that immediately
; follows, so the expected output is cvttsd2si with a QWORD PTR memory operand.
define internal i32 @fptosi_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%result = fptosi double %load to i16
; The i16 conversion result is widened (with zext) to i32 before being returned.
%result2 = zext i16 %result to i32
ret i32 %result2
}
; CHECK-LABEL: fptosi_fold
; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two fptosi instructions, so the cast right
; after the load is not the last use of %load and no cvttsd2si with the memory
; operand is expected.
define internal i32 @fptosi_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%tmp1 = fptosi double %load to i16
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = fptosi double %load to i16
%result = add i16 %tmp1, %tmp2
; The i16 sum is widened (with zext) to i32 before being returned.
%result2 = zext i16 %result to i32
ret i32 %result2
}
; Test that load folding does not happen.
; CHECK-LABEL: fptosi_nofold
; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
; Folding case: the loaded i16 is used only by the uitofp that immediately
; follows. The expected output is a movzx with a WORD PTR memory operand
; (presumably the widening step of the uitofp lowering -- confirm against
; lowerCast).
define internal double @uitofp_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i16*
%load = load i16* %addr, align 1
%result = uitofp i16 %load to double
ret double %result
}
; CHECK-LABEL: uitofp_fold
; CHECK: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two uitofp instructions, so the cast right
; after the load is not the last use of %load and no movzx with the WORD PTR
; memory operand is expected.
define internal double @uitofp_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i16*
%load = load i16* %addr, align 1
%tmp1 = uitofp i16 %load to double
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = uitofp i16 %load to double
%result = fadd double %tmp1, %tmp2
ret double %result
}
; Test that load folding does not happen.
; CHECK-LABEL: uitofp_nofold
; CHECK-NOT: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
; Folding case: the loaded i16 is used only by the sitofp that immediately
; follows. The expected output is a movsx with a WORD PTR memory operand
; (presumably the widening step of the sitofp lowering -- confirm against
; lowerCast).
define internal double @sitofp_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i16*
%load = load i16* %addr, align 1
%result = sitofp i16 %load to double
ret double %result
}
; CHECK-LABEL: sitofp_fold
; CHECK: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two sitofp instructions, so the cast right
; after the load is not the last use of %load and no movsx with the WORD PTR
; memory operand is expected.
define internal double @sitofp_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i16*
%load = load i16* %addr, align 1
%tmp1 = sitofp i16 %load to double
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = sitofp i16 %load to double
%result = fadd double %tmp1, %tmp2
ret double %result
}
; Test that load folding does not happen.
; CHECK-LABEL: sitofp_nofold
; CHECK-NOT: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
; Folding case: the loaded i64 is used only by the bitcast to double that
; immediately follows, so the expected output is a movq with a QWORD PTR
; memory operand.
define internal double @bitcast_i64_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement that appears in the expected memory operand.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i64*
%load = load i64* %addr, align 1
%result = bitcast i64 %load to double
ret double %result
}
; CHECK-LABEL: bitcast_i64_fold
; CHECK: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
; Non-folding case: %load feeds two bitcast instructions, so the cast right
; after the load is not the last use of %load and no movq with the QWORD PTR
; memory operand is expected.
define internal double @bitcast_i64_nofold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement used in the pattern that must not appear.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to i64*
%load = load i64* %addr, align 1
%tmp1 = bitcast i64 %load to double
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = bitcast i64 %load to double
%result = fadd double %tmp1, %tmp2
ret double %result
}
; Test that load folding does not happen.
; CHECK-LABEL: bitcast_i64_nofold
; CHECK-NOT: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
; Folding case: the loaded double is used only by the bitcast to i64 that
; immediately follows. The expected output contains no QWORD PTR operand at
; all; instead it has two 32-bit mov loads from displacements 0xc8 and 0xcc
; (200 and 204), i.e. the two halves of the 64-bit value.
define internal i64 @bitcast_double_fold(i32 %arg) {
entry:
; 200 == 0xc8, the displacement of the first expected 32-bit load.
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%result = bitcast double %load to i64
ret i64 %result
}
; CHECK-LABEL: bitcast_double_fold
; CHECK-NOT: QWORD PTR
; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xc8]
; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xcc]
; CHECK-NOT: QWORD PTR
; Non-folding case: %load feeds two bitcast instructions, so the cast right
; after the load is not the last use of %load. Unlike the folded variant, the
; expected output here does contain QWORD PTR operands (matched twice, in
; order).
define internal i64 @bitcast_double_nofold(i32 %arg) {
entry:
%ptr = add i32 %arg, 200
%addr = inttoptr i32 %ptr to double*
%load = load double* %addr, align 8
%tmp1 = bitcast double %load to i64
; Second use of %load; this is what keeps %tmp1 from being the last use.
%tmp2 = bitcast double %load to i64
%result = add i64 %tmp1, %tmp2
ret i64 %result
}
; Test that load folding does not happen.
; CHECK-LABEL: bitcast_double_nofold
; CHECK: QWORD PTR
; CHECK: QWORD PTR
...@@ -48,7 +48,8 @@ entry: ...@@ -48,7 +48,8 @@ entry:
%ptr = inttoptr i32 %iptr to i8* %ptr = inttoptr i32 %iptr to i8*
; parameter value "6" is for the sequential consistency memory order. ; parameter value "6" is for the sequential consistency memory order.
%i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6) %i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6)
%r = zext i8 %i to i32 %i2 = sub i8 %i, 0
%r = zext i8 %i2 to i32
ret i32 %r ret i32 %r
} }
; CHECK-LABEL: test_atomic_load_8 ; CHECK-LABEL: test_atomic_load_8
...@@ -59,7 +60,8 @@ define i32 @test_atomic_load_16(i32 %iptr) { ...@@ -59,7 +60,8 @@ define i32 @test_atomic_load_16(i32 %iptr) {
entry: entry:
%ptr = inttoptr i32 %iptr to i16* %ptr = inttoptr i32 %iptr to i16*
%i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6) %i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6)
%r = zext i16 %i to i32 %i2 = sub i16 %i, 0
%r = zext i16 %i2 to i32
ret i32 %r ret i32 %r
} }
; CHECK-LABEL: test_atomic_load_16 ; CHECK-LABEL: test_atomic_load_16
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment