Commit 9df4a379 by Andrew Scull

Inline memcpy for small constant sizes.

Combined with memset inlining, this has shown an improvement of over 11% on the eon benchmark. This the only C++ program in spec2k and it seems C++ programs have a significantly larger number of memset/memcpy calls. Other benchmarks also showed improvement of ~5% (perlbmk, parser) while most had a 1-2% improvement. This commit also includes a refactoring of lowerMemset which is much more readable and also removed the fake use of the destination pointer register. BUG= R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/1279833005.
parent f9df4523
...@@ -153,7 +153,9 @@ protected: ...@@ -153,7 +153,9 @@ protected:
Operand *Val); Operand *Val);
void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal, void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
Operand *SecondVal); Operand *SecondVal);
/// Replace a function call with inline instructions. /// Replace a call to memcpy with inline instructions.
void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
/// Replace a call to memset with inline instructions.
void lowerMemset(Operand *Dest, Operand *Val, Operand *Count); void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
/// Lower an indirect jump adding sandboxing when needed. /// Lower an indirect jump adding sandboxing when needed.
......
...@@ -3030,13 +3030,7 @@ void TargetX86Base<Machine>::lowerIntrinsicCall( ...@@ -3030,13 +3030,7 @@ void TargetX86Base<Machine>::lowerIntrinsicCall(
return; return;
} }
case Intrinsics::Memcpy: { case Intrinsics::Memcpy: {
// In the future, we could potentially emit an inline memcpy/memset, etc. lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
// for intrinsic calls w/ a known length.
InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
Call->addArg(Instr->getArg(0));
Call->addArg(Instr->getArg(1));
Call->addArg(Instr->getArg(2));
lowerCall(Call);
return; return;
} }
case Intrinsics::Memmove: { case Intrinsics::Memmove: {
...@@ -3487,9 +3481,123 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, ...@@ -3487,9 +3481,123 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
} }
template <class Machine> template <class Machine>
void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
Operand *Count) {
// There is a load and store for each chunk in the unroll
constexpr uint32_t UNROLL_LIMIT = 8;
constexpr uint32_t BytesPerStorep = 16;
constexpr uint32_t BytesPerStoreq = 8;
constexpr uint32_t BytesPerStorei32 = 4;
constexpr uint32_t BytesPerStorei16 = 2;
constexpr uint32_t BytesPerStorei8 = 1;
// Check if the operands are constants
const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
const bool IsCountConst = CountConst != nullptr;
const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
// Unlikely, but nothing to do if it does happen
if (CountValue == 0)
return;
Variable *SrcBase = legalizeToReg(Src);
Variable *DestBase = legalizeToReg(Dest);
auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
// TODO(ascull): this or add nullptr test to _movp, _movq
Variable *Data = makeReg(Ty);
// TODO(ascull): is 64-bit better with vector or scalar movq?
auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
if (isVectorType(Ty))
_movp(Data, SrcMem);
else if (Ty == IceType_f64)
_movq(Data, SrcMem);
else
_mov(Data, SrcMem);
auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
if (isVectorType(Ty))
_storep(Data, DestMem);
else if (Ty == IceType_f64)
_storeq(Data, DestMem);
else
_store(Data, DestMem);
};
// Lowers the assignment to the remaining bytes. Assumes the original size
// was large enough to allow for overlaps.
auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
if (Size > BytesPerStoreq) {
lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
} else if (Size > BytesPerStorei32) {
lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
} else if (Size > BytesPerStorei16) {
lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
} else if (Size > BytesPerStorei8) {
lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
} else if (Size == BytesPerStorei8) {
lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
}
};
if (CountValue >= BytesPerStorep) {
// Use large vector operations
for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
N -= BytesPerStorep;
lowerCopy(IceType_v16i8, N);
}
lowerLeftOvers(CountValue & 0xF);
return;
}
// Too small to use large vector operations so use small ones instead
if (CountValue >= BytesPerStoreq) {
lowerCopy(IceType_f64, 0);
lowerLeftOvers(CountValue - BytesPerStoreq);
return;
}
// Too small for vector operations so use scalar ones
if (CountValue >= BytesPerStorei32) {
lowerCopy(IceType_i32, 0);
lowerLeftOvers(CountValue - BytesPerStorei32);
return;
}
// 3 is the awkward size as it is too small for the vector or 32-bit
// operations and will not work with lowerLeftOvers as there is no valid
// overlap.
if (CountValue == 3) {
lowerCopy(IceType_i16, 0);
lowerCopy(IceType_i8, 2);
return;
}
// 1 or 2 can be done in a single scalar copy
lowerLeftOvers(CountValue);
return;
}
// Fall back on a function call
InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
Call->addArg(Dest);
Call->addArg(Src);
Call->addArg(Count);
lowerCall(Call);
}
template <class Machine>
void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
Operand *Count) { Operand *Count) {
constexpr uint32_t UNROLL_LIMIT = 16; constexpr uint32_t UNROLL_LIMIT = 16;
constexpr uint32_t BytesPerStorep = 16;
constexpr uint32_t BytesPerStoreq = 8;
constexpr uint32_t BytesPerStorei32 = 4;
constexpr uint32_t BytesPerStorei16 = 2;
constexpr uint32_t BytesPerStorei8 = 1;
assert(Val->getType() == IceType_i8); assert(Val->getType() == IceType_i8);
// Check if the operands are constants // Check if the operands are constants
...@@ -3508,109 +3616,86 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, ...@@ -3508,109 +3616,86 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
// to inline by spreading the value across 4 bytes and accessing subregs e.g. // to inline by spreading the value across 4 bytes and accessing subregs e.g.
// eax, ax and al. // eax, ax and al.
if (IsCountConst && IsValConst) { if (IsCountConst && IsValConst) {
Variable *Base = legalizeToReg(Dest); Variable *Base = nullptr;
// Add a FakeUse in case Base is ultimately not used, e.g. it falls back to const uint32_t SpreadValue =
// calling memset(). Otherwise Om1 register allocation fails because this (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
// infinite-weight variable has a definition but no uses. Variable *VecReg = nullptr;
Context.insert(InstFakeUse::create(Func, Base));
// 3 is the awkward size as it is too small for the vector or 32-bit
// operations and will not work with lowerLeftOvers as there is no valid
// overlap.
if (CountValue == 3) {
Constant *Offset = nullptr;
auto *Mem =
Traits::X86OperandMem::create(Func, IceType_i16, Base, Offset);
_store(Ctx->getConstantInt16((ValValue << 8) | ValValue), Mem);
Offset = Ctx->getConstantInt8(2); auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
Mem = Traits::X86OperandMem::create(Func, IceType_i8, Base, Offset); uint32_t OffsetAmt) {
_store(Ctx->getConstantInt8(ValValue), Mem); assert(Base != nullptr);
return; Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
}
// Lowers the assignment to the remaining bytes. Assumes the original size // TODO(ascull): is 64-bit better with vector or scalar movq?
// was large enough to allow for overlaps.
auto lowerLeftOvers = [this, Base, CountValue](
uint32_t SpreadValue, uint32_t Size, Variable *VecReg) {
auto lowerStoreSpreadValue =
[this, Base, CountValue, SpreadValue](Type Ty) {
Constant *Offset =
Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
_store(Ctx->getConstantInt(Ty, SpreadValue), Mem); if (isVectorType(Ty)) {
};
if (Size > 8) {
assert(VecReg != nullptr); assert(VecReg != nullptr);
Constant *Offset = Ctx->getConstantInt32(CountValue - 16);
auto *Mem = Traits::X86OperandMem::create(Func, VecReg->getType(), Base,
Offset);
_storep(VecReg, Mem); _storep(VecReg, Mem);
} else if (Size > 4) { } else if (Ty == IceType_i64) {
assert(VecReg != nullptr); assert(VecReg != nullptr);
Constant *Offset = Ctx->getConstantInt32(CountValue - 8);
auto *Mem =
Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
_storeq(VecReg, Mem); _storeq(VecReg, Mem);
} else if (Size > 2) { } else {
lowerStoreSpreadValue(IceType_i32); _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
} else if (Size > 1) {
lowerStoreSpreadValue(IceType_i16);
} else if (Size == 1) {
lowerStoreSpreadValue(IceType_i8);
} }
}; };
// When the value is zero it can be loaded into a register cheaply using // Lowers the assignment to the remaining bytes. Assumes the original size
// the xor trick. // was large enough to allow for overlaps.
constexpr uint32_t BytesPerStorep = 16; auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
if (ValValue == 0 && CountValue >= 8 && if (Size > BytesPerStoreq) {
lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
} else if (Size > BytesPerStorei32) {
lowerSet(IceType_i64, CountValue - BytesPerStoreq);
} else if (Size > BytesPerStorei16) {
lowerSet(IceType_i32, CountValue - BytesPerStorei32);
} else if (Size > BytesPerStorei8) {
lowerSet(IceType_i16, CountValue - BytesPerStorei16);
} else if (Size == BytesPerStorei8) {
lowerSet(IceType_i8, CountValue - BytesPerStorei8);
}
};
// When the value is zero it can be loaded into a vector register cheaply
// using the xor trick.
if (ValValue == 0 && CountValue >= BytesPerStoreq &&
CountValue <= BytesPerStorep * UNROLL_LIMIT) { CountValue <= BytesPerStorep * UNROLL_LIMIT) {
Variable *Zero = makeVectorOfZeros(IceType_v16i8); Base = legalizeToReg(Dest);
VecReg = makeVectorOfZeros(IceType_v16i8);
// Too small to use large vector operations so use small ones instead // Too small to use large vector operations so use small ones instead
if (CountValue < 16) { if (CountValue < BytesPerStorep) {
Constant *Offset = nullptr; lowerSet(IceType_i64, 0);
auto *Mem = lowerLeftOvers(CountValue - BytesPerStoreq);
Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
_storeq(Zero, Mem);
lowerLeftOvers(0, CountValue - 8, Zero);
return; return;
} }
assert(CountValue >= 16);
// Use large vector operations // Use large vector operations
for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
N -= 16; N -= 16;
Constant *Offset = Ctx->getConstantInt32(N); lowerSet(IceType_v16i8, N);
auto *Mem =
Traits::X86OperandMem::create(Func, Zero->getType(), Base, Offset);
_storep(Zero, Mem);
} }
uint32_t LeftOver = CountValue & 0xF; lowerLeftOvers(CountValue & 0xF);
lowerLeftOvers(0, LeftOver, Zero);
return; return;
} }
// TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
constexpr uint32_t BytesPerStore = 4; if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
if (CountValue <= BytesPerStore * UNROLL_LIMIT) { Base = legalizeToReg(Dest);
// 3 is the awkward size as it is too small for the vector or 32-bit
// operations and will not work with lowerLeftOvers as there is no valid
// overlap.
if (CountValue == 3) {
lowerSet(IceType_i16, 0);
lowerSet(IceType_i8, 2);
return;
}
// TODO(ascull); 64-bit can do better with 64-bit mov // TODO(ascull); 64-bit can do better with 64-bit mov
uint32_t SpreadValue =
(ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
if (CountValue >= 4) {
Constant *ValueConst = Ctx->getConstantInt32(SpreadValue);
for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
N -= 4; N -= 4;
Constant *Offset = Ctx->getConstantInt32(N); lowerSet(IceType_i32, N);
auto *Mem =
Traits::X86OperandMem::create(Func, IceType_i32, Base, Offset);
_store(ValueConst, Mem);
}
} }
uint32_t LeftOver = CountValue & 0x3; lowerLeftOvers(CountValue & 0x3);
lowerLeftOvers(SpreadValue, LeftOver, nullptr);
return; return;
} }
} }
......
...@@ -30,21 +30,123 @@ entry: ...@@ -30,21 +30,123 @@ entry:
; ARM32-LABEL: test_memcpy ; ARM32-LABEL: test_memcpy
; ARM32: bl {{.*}} memcpy ; ARM32: bl {{.*}} memcpy
; TODO(jvoung) -- if we want to be clever, we can do this and the memmove, define void @test_memcpy_long_const_len(i32 %iptr_dst, i32 %iptr_src) {
; memset without a function call.
define void @test_memcpy_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
entry: entry:
%dst = inttoptr i32 %iptr_dst to i8* %dst = inttoptr i32 %iptr_dst to i8*
%src = inttoptr i32 %iptr_src to i8* %src = inttoptr i32 %iptr_src to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
i32 32, i32 1, i1 false) i32 4876, i32 1, i1 false)
ret void ret void
} }
; CHECK-LABEL: test_memcpy_const_len_align ; CHECK-LABEL: test_memcpy_long_const_len
; CHECK: call {{.*}} R_{{.*}} memcpy ; CHECK: call {{.*}} R_{{.*}} memcpy
; ARM32-LABEL: test_memcpy_const_len_align ; ARM32-LABEL: test_memcpy_long_const_len
; ARM32: bl {{.*}} memcpy
define void @test_memcpy_very_small_const_len(i32 %iptr_dst, i32 %iptr_src) {
entry:
%dst = inttoptr i32 %iptr_dst to i8*
%src = inttoptr i32 %iptr_src to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
i32 2, i32 1, i1 false)
ret void
}
; CHECK-LABEL: test_memcpy_very_small_const_len
; CHECK: mov [[REG:[^,]*]],WORD PTR [{{.*}}]
; CHECK-NEXT: mov WORD PTR [{{.*}}],[[REG]]
; CHECK-NOT: mov
; ARM32-LABEL: test_memcpy_very_small_const_len
; ARM32: bl {{.*}} memcpy
define void @test_memcpy_const_len_3(i32 %iptr_dst, i32 %iptr_src) {
entry:
%dst = inttoptr i32 %iptr_dst to i8*
%src = inttoptr i32 %iptr_src to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
i32 3, i32 1, i1 false)
ret void
}
; CHECK-LABEL: test_memcpy_const_len_3
; CHECK: mov [[REG:[^,]*]],WORD PTR [{{.*}}]
; CHECK-NEXT: mov WORD PTR [{{.*}}],[[REG]]
; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x2]
; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x2],[[REG]]
; CHECK-NOT: mov
; ARM32-LABEL: test_memcpy_const_len_3
; ARM32: bl {{.*}} memcpy
define void @test_memcpy_mid_const_len(i32 %iptr_dst, i32 %iptr_src) {
entry:
%dst = inttoptr i32 %iptr_dst to i8*
%src = inttoptr i32 %iptr_src to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
i32 9, i32 1, i1 false)
ret void
}
; CHECK-LABEL: test_memcpy_mid_const_len
; CHECK: movq [[REG:xmm[0-9]+]],QWORD PTR [{{.*}}]
; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[REG]]
; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x8]
; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x8],[[REG]]
; CHECK-NOT: mov
; ARM32-LABEL: test_memcpy_mid_const_len
; ARM32: bl {{.*}} memcpy
define void @test_memcpy_mid_const_len_overlap(i32 %iptr_dst, i32 %iptr_src) {
entry:
%dst = inttoptr i32 %iptr_dst to i8*
%src = inttoptr i32 %iptr_src to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
i32 15, i32 1, i1 false)
ret void
}
; CHECK-LABEL: test_memcpy_mid_const_len_overlap
; CHECK: movq [[REG:xmm[0-9]+]],QWORD PTR [{{.*}}]
; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[REG]]
; CHECK-NEXT: movq [[REG:xmm[0-9]+]],QWORD PTR [{{.*}}+0x7]
; CHECK-NEXT: movq QWORD PTR [{{.*}}+0x7],[[REG]]
; CHECK-NOT: mov
; ARM32-LABEL: test_memcpy_mid_const_len_overlap
; ARM32: bl {{.*}} memcpy
define void @test_memcpy_large_const_len_overlap(i32 %iptr_dst, i32 %iptr_src) {
entry:
%dst = inttoptr i32 %iptr_dst to i8*
%src = inttoptr i32 %iptr_src to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
i32 30, i32 1, i1 false)
ret void
}
; CHECK-LABEL: test_memcpy_large_const_len_overlap
; CHECK: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}]
; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[REG]]
; CHECK-NEXT: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}+0xe]
; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0xe],[[REG]]
; CHECK-NOT: mov
; ARM32-LABEL: test_memcpy_large_const_len_overlap
; ARM32: bl {{.*}} memcpy
define void @test_memcpy_large_const_len(i32 %iptr_dst, i32 %iptr_src) {
entry:
%dst = inttoptr i32 %iptr_dst to i8*
%src = inttoptr i32 %iptr_src to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
i32 33, i32 1, i1 false)
ret void
}
; CHECK-LABEL: test_memcpy_large_const_len
; CHECK: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}+0x10]
; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0x10],[[REG]]
; CHECK-NEXT: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}]
; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[REG]]
; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x20]
; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x20],[[REG]]
; CHECK-NOT: mov
; ARM32-LABEL: test_memcpy_large_const_len
; ARM32: bl {{.*}} memcpy ; ARM32: bl {{.*}} memcpy
; TODO(jvoung) -- if we want to be clever, we can do memset without a function
; call similar to memcpy.
define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) { define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
entry: entry:
%dst = inttoptr i32 %iptr_dst to i8* %dst = inttoptr i32 %iptr_dst to i8*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment