Commit 55f931f6 by Jim Stichnoth

Subzero: Improve handling of alloca instructions of constant size.

PNaCl simplifies varargs calls by creating a known-size argument array with an alloca instruction, and passing the address of that argument array. These alloca instructions don't necessarily require use of a frame pointer, freeing up the frame pointer register for normal register allocation. These varargs calls sometimes show up in cold paths of hot functions, so increasing the number of registers available to the register allocator can produce tangible gains.

This patch does a simple recognition of these alloca patterns, and on x86 doesn't force a frame pointer if all alloca instructions are suitable.

Future work is to avoid saving the alloca result as a local variable, and instead rematerialize the address as needed with respect to the stack or frame pointer.

BUG= none
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1361803002 .
parent 467a222f
...@@ -201,6 +201,27 @@ void Cfg::translate() { ...@@ -201,6 +201,27 @@ void Cfg::translate() {
if (auto Var64On32 = llvm::dyn_cast<Variable64On32>(Var)) if (auto Var64On32 = llvm::dyn_cast<Variable64On32>(Var))
Var64On32->initHiLo(this); Var64On32->initHiLo(this);
// Figure out which alloca instructions result in storage at known stack frame
// offsets. If this is true for all alloca instructions, then a stack pointer
// can still be used instead of a frame pointer, freeing up the frame pointer
// for normal register allocation. Additionally, for each such alloca, its
// address could be rematerialized at each use in terms of the stack/frame
// pointer, saving a stack slot and a load from that stack slot.
//
// This simple implementation is limited to alloca instructions at the start
// of the entry node.
for (Inst &Instr : getEntryNode()->getInsts()) {
if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
if (llvm::isa<Constant>(Alloca->getSizeInBytes())) {
Alloca->setKnownFrameOffset();
continue;
}
}
// The first instruction that is not an alloca with a constant size stops
// the search.
break;
}
// The set of translation passes and their order are determined by the // The set of translation passes and their order are determined by the
// target. // target.
getTarget()->translate(); getTarget()->translate();
......
...@@ -246,6 +246,8 @@ public: ...@@ -246,6 +246,8 @@ public:
} }
uint32_t getAlignInBytes() const { return AlignInBytes; } uint32_t getAlignInBytes() const { return AlignInBytes; }
Operand *getSizeInBytes() const { return getSrc(0); } Operand *getSizeInBytes() const { return getSrc(0); }
/// Returns true if this alloca has been marked as producing storage at a
/// known stack frame offset. The flag defaults to false and is only set via
/// setKnownFrameOffset().
bool getKnownFrameOffset() const { return KnownFrameOffset; }
/// Marks this alloca as producing storage at a known stack frame offset.
/// Cfg::translate() calls this for constant-size allocas at the start of the
/// entry node.
void setKnownFrameOffset() { KnownFrameOffset = true; }
void dump(const Cfg *Func) const override; void dump(const Cfg *Func) const override;
static bool classof(const Inst *Inst) { return Inst->getKind() == Alloca; } static bool classof(const Inst *Inst) { return Inst->getKind() == Alloca; }
...@@ -254,6 +256,7 @@ private: ...@@ -254,6 +256,7 @@ private:
Variable *Dest); Variable *Dest);
const uint32_t AlignInBytes; const uint32_t AlignInBytes;
bool KnownFrameOffset = false;
}; };
/// Binary arithmetic instruction. The source operands are captured in getSrc(0) /// Binary arithmetic instruction. The source operands are captured in getSrc(0)
......
...@@ -359,7 +359,8 @@ protected: ...@@ -359,7 +359,8 @@ protected:
bool HasComputedFrame = false; bool HasComputedFrame = false;
bool CallsReturnsTwice = false; bool CallsReturnsTwice = false;
/// StackAdjustment keeps track of the current stack offset from its natural /// StackAdjustment keeps track of the current stack offset from its natural
/// location, as arguments are pushed for a function call. /// location, e.g. as arguments are pushed for a function call or as
/// fixed-size alloca instructions are executed in the entry block.
int32_t StackAdjustment = 0; int32_t StackAdjustment = 0;
SizeT NextLabelNumber = 0; SizeT NextLabelNumber = 0;
SizeT NextJumpTableNumber = 0; SizeT NextJumpTableNumber = 0;
......
...@@ -487,9 +487,16 @@ void TargetX8632::addProlog(CfgNode *Node) { ...@@ -487,9 +487,16 @@ void TargetX8632::addProlog(CfgNode *Node) {
if (SpillAreaSizeBytes) if (SpillAreaSizeBytes)
_sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp), _sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp),
Ctx->getConstantInt32(SpillAreaSizeBytes)); Ctx->getConstantInt32(SpillAreaSizeBytes));
// Account for alloca instructions with known frame offsets.
SpillAreaSizeBytes += FixedAllocaSizeBytes;
Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes); Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
// Initialize the stack adjustment so that after all the known-frame-offset
// alloca instructions are emitted, the stack adjustment will reach zero.
resetStackAdjustment(); resetStackAdjustment();
updateStackAdjustment(-FixedAllocaSizeBytes);
// Fill in stack offsets for stack args, and copy args into registers for // Fill in stack offsets for stack args, and copy args into registers for
// those that were register-allocated. Args are pushed right to left, so // those that were register-allocated. Args are pushed right to left, so
...@@ -509,7 +516,13 @@ void TargetX8632::addProlog(CfgNode *Node) { ...@@ -509,7 +516,13 @@ void TargetX8632::addProlog(CfgNode *Node) {
++NumXmmArgs; ++NumXmmArgs;
continue; continue;
} }
finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes); // For esp-based frames, the esp value may not stabilize to its home value
// until after all the fixed-size alloca instructions have executed. In
// this case, a stack adjustment is needed when accessing in-args in order
// to copy them into registers.
size_t StackAdjBytes = IsEbpBasedFrame ? 0 : -FixedAllocaSizeBytes;
finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
InArgsSizeBytes);
} }
// Fill in stack offsets for locals. // Fill in stack offsets for locals.
......
...@@ -512,9 +512,16 @@ void TargetX8664::addProlog(CfgNode *Node) { ...@@ -512,9 +512,16 @@ void TargetX8664::addProlog(CfgNode *Node) {
if (SpillAreaSizeBytes) if (SpillAreaSizeBytes)
_sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp), _sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp),
Ctx->getConstantInt32(SpillAreaSizeBytes)); Ctx->getConstantInt32(SpillAreaSizeBytes));
// Account for alloca instructions with known frame offsets.
SpillAreaSizeBytes += FixedAllocaSizeBytes;
Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes); Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
// Initialize the stack adjustment so that after all the known-frame-offset
// alloca instructions are emitted, the stack adjustment will reach zero.
resetStackAdjustment(); resetStackAdjustment();
updateStackAdjustment(-FixedAllocaSizeBytes);
// Fill in stack offsets for stack args, and copy args into registers for // Fill in stack offsets for stack args, and copy args into registers for
// those that were register-allocated. Args are pushed right to left, so // those that were register-allocated. Args are pushed right to left, so
...@@ -543,7 +550,13 @@ void TargetX8664::addProlog(CfgNode *Node) { ...@@ -543,7 +550,13 @@ void TargetX8664::addProlog(CfgNode *Node) {
continue; continue;
} }
} }
finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes); // For esp-based frames, the esp value may not stabilize to its home value
// until after all the fixed-size alloca instructions have executed. In
// this case, a stack adjustment is needed when accessing in-args in order
// to copy them into registers.
size_t StackAdjBytes = IsEbpBasedFrame ? 0 : -FixedAllocaSizeBytes;
finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
InArgsSizeBytes);
} }
// Fill in stack offsets for locals. // Fill in stack offsets for locals.
......
...@@ -128,7 +128,8 @@ public: ...@@ -128,7 +128,8 @@ public:
} }
void finishArgumentLowering(Variable *Arg, Variable *FramePtr, void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
size_t BasicFrameOffset, size_t &InArgsSizeBytes); size_t BasicFrameOffset, size_t StackAdjBytes,
size_t &InArgsSizeBytes);
typename Traits::Address stackVarToAsmOperand(const Variable *Var) const; typename Traits::Address stackVarToAsmOperand(const Variable *Var) const;
typename Traits::InstructionSet getInstructionSet() const { typename Traits::InstructionSet getInstructionSet() const {
...@@ -675,6 +676,7 @@ protected: ...@@ -675,6 +676,7 @@ protected:
bool IsEbpBasedFrame = false; bool IsEbpBasedFrame = false;
bool NeedsStackAlignment = false; bool NeedsStackAlignment = false;
size_t SpillAreaSizeBytes = 0; size_t SpillAreaSizeBytes = 0;
size_t FixedAllocaSizeBytes = 0;
std::array<llvm::SmallBitVector, IceType_NUM> TypeToRegisterSet; std::array<llvm::SmallBitVector, IceType_NUM> TypeToRegisterSet;
std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM> std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
RegisterAliases; RegisterAliases;
......
...@@ -791,13 +791,16 @@ template <class Machine> ...@@ -791,13 +791,16 @@ template <class Machine>
void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg, void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg,
Variable *FramePtr, Variable *FramePtr,
size_t BasicFrameOffset, size_t BasicFrameOffset,
size_t StackAdjBytes,
size_t &InArgsSizeBytes) { size_t &InArgsSizeBytes) {
if (!Traits::Is64Bit) { if (!Traits::Is64Bit) {
if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) { if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
Variable *Lo = Arg64On32->getLo(); Variable *Lo = Arg64On32->getLo();
Variable *Hi = Arg64On32->getHi(); Variable *Hi = Arg64On32->getHi();
finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes); finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes); InArgsSizeBytes);
finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
InArgsSizeBytes);
return; return;
} }
} }
...@@ -810,7 +813,8 @@ void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg, ...@@ -810,7 +813,8 @@ void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg,
if (Arg->hasReg()) { if (Arg->hasReg()) {
assert(Ty != IceType_i64 || Traits::Is64Bit); assert(Ty != IceType_i64 || Traits::Is64Bit);
typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create( typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create(
Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset())); Func, Ty, FramePtr,
Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
if (isVectorType(Arg->getType())) { if (isVectorType(Arg->getType())) {
_movp(Arg, Mem); _movp(Arg, Mem);
} else { } else {
...@@ -905,6 +909,7 @@ TargetX86Base<Machine>::getRegisterSet(RegSetMask Include, ...@@ -905,6 +909,7 @@ TargetX86Base<Machine>::getRegisterSet(RegSetMask Include,
template <class Machine> template <class Machine>
void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) { void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
if (!Inst->getKnownFrameOffset())
IsEbpBasedFrame = true; IsEbpBasedFrame = true;
// Conservatively require the stack to be aligned. Some stack adjustment // Conservatively require the stack to be aligned. Some stack adjustment
// operations implemented below assume that the stack is aligned before the // operations implemented below assume that the stack is aligned before the
...@@ -935,7 +940,12 @@ void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) { ...@@ -935,7 +940,12 @@ void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
llvm::dyn_cast<ConstantInteger32>(TotalSize)) { llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
uint32_t Value = ConstantTotalSize->getValue(); uint32_t Value = ConstantTotalSize->getValue();
Value = Utils::applyAlignment(Value, Alignment); Value = Utils::applyAlignment(Value, Alignment);
if (Inst->getKnownFrameOffset()) {
_adjust_stack(Value);
FixedAllocaSizeBytes += Value;
} else {
_sub(esp, Ctx->getConstantInt32(Value)); _sub(esp, Ctx->getConstantInt32(Value));
}
} else { } else {
// Non-constant sizes need to be adjusted to the next highest multiple of // Non-constant sizes need to be adjusted to the next highest multiple of
// the required alignment at runtime. // the required alignment at runtime.
......
...@@ -40,6 +40,8 @@ declare void @ForceXmmSpills() ...@@ -40,6 +40,8 @@ declare void @ForceXmmSpills()
define <4 x i32> @align_global_vector_ebp_based(i32 %arg) { define <4 x i32> @align_global_vector_ebp_based(i32 %arg) {
entry: entry:
br label %eblock ; Disable alloca optimization
eblock:
%alloc = alloca i8, i32 1, align 1 %alloc = alloca i8, i32 1, align 1
%vec.global = insertelement <4 x i32> undef, i32 %arg, i32 0 %vec.global = insertelement <4 x i32> undef, i32 %arg, i32 0
br label %block br label %block
...@@ -55,6 +57,8 @@ block: ...@@ -55,6 +57,8 @@ block:
define <4 x i32> @align_local_vector_ebp_based(i32 %arg) { define <4 x i32> @align_local_vector_ebp_based(i32 %arg) {
entry: entry:
br label %eblock ; Disable alloca optimization
eblock:
%alloc = alloca i8, i32 1, align 1 %alloc = alloca i8, i32 1, align 1
%vec.local = insertelement <4 x i32> undef, i32 %arg, i32 0 %vec.local = insertelement <4 x i32> undef, i32 %arg, i32 0
call void @ForceXmmSpillsAndUseAlloca(i8* %alloc) call void @ForceXmmSpillsAndUseAlloca(i8* %alloc)
......
...@@ -221,3 +221,37 @@ entry: ...@@ -221,3 +221,37 @@ entry:
; ARM32: add r0, r0, [[REG]] ; ARM32: add r0, r0, [[REG]]
; ARM32: and r0, r0, #-536870912 ; 0xe0000000 ; ARM32: and r0, r0, #-536870912 ; 0xe0000000
; ARM32: sub sp, sp, r0 ; ARM32: sub sp, sp, r0
; Test that a simple alloca sequence doesn't trigger a frame pointer.
; All three allocas have a constant size and appear at the very start of the
; entry block, ahead of any other instruction.
define void @fixed_no_frameptr(i32 %arg) {
entry:
%a1 = alloca i8, i32 8, align 4
%a2 = alloca i8, i32 12, align 4
%a3 = alloca i8, i32 16, align 4
%p1 = bitcast i8* %a1 to i32*
%p2 = bitcast i8* %a2 to i32*
%p3 = bitcast i8* %a3 to i32*
; Store through each allocation so that every alloca result has a use.
store i32 %arg, i32* %p1, align 1
store i32 %arg, i32* %p2, align 1
store i32 %arg, i32* %p3, align 1
ret void
}
; CHECK-LABEL: fixed_no_frameptr
; CHECK-NOT: mov ebp,esp
; Test that a more complex alloca sequence does trigger a frame pointer.
; The first two allocas have constant sizes, but the size of %a3 is the
; runtime value %arg, so not every alloca in the function is constant-size.
define void @var_with_frameptr(i32 %arg) {
entry:
%a1 = alloca i8, i32 8, align 4
%a2 = alloca i8, i32 12, align 4
%a3 = alloca i8, i32 %arg, align 4
%p1 = bitcast i8* %a1 to i32*
%p2 = bitcast i8* %a2 to i32*
%p3 = bitcast i8* %a3 to i32*
; Store through each allocation so that every alloca result has a use.
store i32 %arg, i32* %p1, align 1
store i32 %arg, i32* %p2, align 1
store i32 %arg, i32* %p3, align 1
ret void
}
; CHECK-LABEL: var_with_frameptr
; CHECK: mov ebp,esp
...@@ -9,6 +9,8 @@ declare i32 @memcpy_helper2(i32 %buf, i32 %buf2, i32 %n) ...@@ -9,6 +9,8 @@ declare i32 @memcpy_helper2(i32 %buf, i32 %buf2, i32 %n)
define i32 @memcpy_helper(i32 %buf, i32 %n) { define i32 @memcpy_helper(i32 %buf, i32 %n) {
entry: entry:
br label %eblock ; Disable alloca optimization
eblock:
%buf2 = alloca i8, i32 128, align 4 %buf2 = alloca i8, i32 128, align 4
%n.arg_trunc = trunc i32 %n to i8 %n.arg_trunc = trunc i32 %n to i8
%arg.ext = zext i8 %n.arg_trunc to i32 %arg.ext = zext i8 %n.arg_trunc to i32
......
...@@ -265,6 +265,8 @@ declare void @use_ptr(i32 %iptr) ...@@ -265,6 +265,8 @@ declare void @use_ptr(i32 %iptr)
define i64 @test_atomic_rmw_add_64_alloca(i32 %iptr, i64 %v) { define i64 @test_atomic_rmw_add_64_alloca(i32 %iptr, i64 %v) {
entry: entry:
br label %eblock ; Disable alloca optimization
eblock:
%alloca_ptr = alloca i8, i32 16, align 16 %alloca_ptr = alloca i8, i32 16, align 16
%ptr = inttoptr i32 %iptr to i64* %ptr = inttoptr i32 %iptr to i64*
%old = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6) %old = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
...@@ -805,6 +807,8 @@ entry: ...@@ -805,6 +807,8 @@ entry:
; used to manage the stack frame, so it cannot be used as a register either. ; used to manage the stack frame, so it cannot be used as a register either.
define i64 @test_atomic_cmpxchg_64_alloca(i32 %iptr, i64 %expected, i64 %desired) { define i64 @test_atomic_cmpxchg_64_alloca(i32 %iptr, i64 %expected, i64 %desired) {
entry: entry:
br label %eblock ; Disable alloca optimization
eblock:
%alloca_ptr = alloca i8, i32 16, align 16 %alloca_ptr = alloca i8, i32 16, align 16
%ptr = inttoptr i32 %iptr to i64* %ptr = inttoptr i32 %iptr to i64*
%old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected, %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment