Subzero: Improve handling of alloca instructions of constant size.

PNaCl simplifies varargs calls by creating a known-size argument array with an alloca instruction, and passing the address of that argument array. These alloca instructions don't necessarily require use of a frame pointer, freeing up the frame pointer register for normal register allocation. These varargs calls sometimes show up in cold paths of hot functions, so increasing the number of registers available to the register allocator can produce tangible gains.

This patch does a simple recognition of these alloca patterns, and on x86 doesn't force a frame pointer if all alloca instructions are suitable.

Future work is to avoid saving the alloca result as a local variable, and instead rematerialize the address as needed with respect to the stack or frame pointer.

BUG= none
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1361803002 .

Subzero: Improve handling of alloca instructions of constant size.
55f931f6 · Jim Stichnoth · 467a222f · 55f931f6 · 55f931f6 · 55f931f6
Commit 55f931f6 authored Sep 23, 2015 by Jim Stichnoth
11 changed files
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -201,6 +201,27 @@ void Cfg::translate() {
    if (auto Var64On32 = llvm::dyn_cast<Variable64On32>(Var))
      Var64On32->initHiLo(this);
+  // Figure out which alloca instructions result in storage at known stack frame
+  // offsets.  If this is true for all alloca instructions, then a stack pointer
+  // can still be used instead of a frame pointer, freeing up the frame pointer
+  // for normal register allocation.  Additionally, for each such alloca, its
+  // address could be rematerialized at each use in terms of the stack/frame
+  // pointer, saving a stack slot and a load from that stack slot.
+  //
+  // This simple implementation is limited to alloca instructions at the start
+  // of the entry node.
+  for (Inst &Instr : getEntryNode()->getInsts()) {
+    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
+      if (llvm::isa<Constant>(Alloca->getSizeInBytes())) {
+        Alloca->setKnownFrameOffset();
+        continue;
+      }
+    }
+    // The first instruction that is not an alloca with a constant size stops
+    // the search.
+    break;
+  }
  // The set of translation passes and their order are determined by the
  // target.
  getTarget()->translate();

--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -246,6 +246,8 @@ public:
  }
  uint32_t getAlignInBytes() const { return AlignInBytes; }
  Operand *getSizeInBytes() const { return getSrc(0); }
+  bool getKnownFrameOffset() const { return KnownFrameOffset; }
+  void setKnownFrameOffset() { KnownFrameOffset = true; }
  void dump(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return Inst->getKind() == Alloca; }
@@ -254,6 +256,7 @@ private:
             Variable *Dest);
  const uint32_t AlignInBytes;
+  bool KnownFrameOffset = false;
 };
 /// Binary arithmetic instruction. The source operands are captured in getSrc(0)

--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -359,7 +359,8 @@ protected:
  bool HasComputedFrame = false;
  bool CallsReturnsTwice = false;
  /// StackAdjustment keeps track of the current stack offset from its natural
-  /// location, as arguments are pushed for a function call.
+  /// location, e.g. as arguments are pushed for a function call or as
+  /// fixed-size alloca instructions are executed in the entry block.
  int32_t StackAdjustment = 0;
  SizeT NextLabelNumber = 0;
  SizeT NextJumpTableNumber = 0;

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -487,9 +487,16 @@ void TargetX8632::addProlog(CfgNode *Node) {
  if (SpillAreaSizeBytes)
    _sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp),
         Ctx->getConstantInt32(SpillAreaSizeBytes));
+  // Account for alloca instructions with known frame offsets.
+  SpillAreaSizeBytes += FixedAllocaSizeBytes;
  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
+  // Initialize the stack adjustment so that after all the known-frame-offset
+  // alloca instructions are emitted, the stack adjustment will reach zero.
  resetStackAdjustment();
+  updateStackAdjustment(-FixedAllocaSizeBytes);
  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
@@ -509,7 +516,13 @@ void TargetX8632::addProlog(CfgNode *Node) {
      ++NumXmmArgs;
      continue;
    }
-    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    // For esp-based frames, the esp value may not stabilize to its home value
+    // until after all the fixed-size alloca instructions have executed.  In
+    // this case, a stack adjustment is needed when accessing in-args in order
+    // to copy them into registers.
+    size_t StackAdjBytes = IsEbpBasedFrame ? 0 : -FixedAllocaSizeBytes;
+    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
+                           InArgsSizeBytes);
  }
  // Fill in stack offsets for locals.

--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -512,9 +512,16 @@ void TargetX8664::addProlog(CfgNode *Node) {
  if (SpillAreaSizeBytes)
    _sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp),
         Ctx->getConstantInt32(SpillAreaSizeBytes));
+  // Account for alloca instructions with known frame offsets.
+  SpillAreaSizeBytes += FixedAllocaSizeBytes;
  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
+  // Initialize the stack adjustment so that after all the known-frame-offset
+  // alloca instructions are emitted, the stack adjustment will reach zero.
  resetStackAdjustment();
+  updateStackAdjustment(-FixedAllocaSizeBytes);
  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
@@ -543,7 +550,13 @@ void TargetX8664::addProlog(CfgNode *Node) {
        continue;
      }
    }
-    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    // For esp-based frames, the esp value may not stabilize to its home value
+    // until after all the fixed-size alloca instructions have executed.  In
+    // this case, a stack adjustment is needed when accessing in-args in order
+    // to copy them into registers.
+    size_t StackAdjBytes = IsEbpBasedFrame ? 0 : -FixedAllocaSizeBytes;
+    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
+                           InArgsSizeBytes);
  }
  // Fill in stack offsets for locals.

--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -128,7 +128,8 @@ public:
  }
  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
-                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
+                              size_t BasicFrameOffset, size_t StackAdjBytes,
+                              size_t &InArgsSizeBytes);
  typename Traits::Address stackVarToAsmOperand(const Variable *Var) const;
  typename Traits::InstructionSet getInstructionSet() const {
@@ -675,6 +676,7 @@ protected:
  bool IsEbpBasedFrame = false;
  bool NeedsStackAlignment = false;
  size_t SpillAreaSizeBytes = 0;
+  size_t FixedAllocaSizeBytes = 0;
  std::array<llvm::SmallBitVector, IceType_NUM> TypeToRegisterSet;
  std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
      RegisterAliases;

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -791,13 +791,16 @@ template <class Machine>
 void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg,
                                                    Variable *FramePtr,
                                                    size_t BasicFrameOffset,
+                                                    size_t StackAdjBytes,
                                                    size_t &InArgsSizeBytes) {
  if (!Traits::Is64Bit) {
    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
      Variable *Lo = Arg64On32->getLo();
      Variable *Hi = Arg64On32->getHi();
-      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
-      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+                             InArgsSizeBytes);
+      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
+                             InArgsSizeBytes);
      return;
    }
  }
@@ -810,7 +813,8 @@ void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg,
  if (Arg->hasReg()) {
    assert(Ty != IceType_i64 || Traits::Is64Bit);
    typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create(
-        Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset()));
+        Func, Ty, FramePtr,
+        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
    if (isVectorType(Arg->getType())) {
      _movp(Arg, Mem);
    } else {
@@ -905,6 +909,7 @@ TargetX86Base<Machine>::getRegisterSet(RegSetMask Include,
 template <class Machine>
 void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
+  if (!Inst->getKnownFrameOffset())
    IsEbpBasedFrame = true;
  // Conservatively require the stack to be aligned. Some stack adjustment
  // operations implemented below assume that the stack is aligned before the
@@ -935,7 +940,12 @@ void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
    uint32_t Value = ConstantTotalSize->getValue();
    Value = Utils::applyAlignment(Value, Alignment);
+    if (Inst->getKnownFrameOffset()) {
+      _adjust_stack(Value);
+      FixedAllocaSizeBytes += Value;
+    } else {
      _sub(esp, Ctx->getConstantInt32(Value));
+    }
  } else {
    // Non-constant sizes need to be adjusted to the next highest multiple of
    // the required alignment at runtime.

--- a/tests_lit/llvm2ice_tests/align-spill-locations.ll
+++ b/tests_lit/llvm2ice_tests/align-spill-locations.ll
@@ -40,6 +40,8 @@ declare void @ForceXmmSpills()
 define <4 x i32> @align_global_vector_ebp_based(i32 %arg) {
 entry:
+  br label %eblock  ; Disable alloca optimization
+eblock:
  %alloc = alloca i8, i32 1, align 1
  %vec.global = insertelement <4 x i32> undef, i32 %arg, i32 0
  br label %block
@@ -55,6 +57,8 @@ block:
 define <4 x i32> @align_local_vector_ebp_based(i32 %arg) {
 entry:
+  br label %eblock  ; Disable alloca optimization
+eblock:
  %alloc = alloca i8, i32 1, align 1
  %vec.local = insertelement <4 x i32> undef, i32 %arg, i32 0
  call void @ForceXmmSpillsAndUseAlloca(i8* %alloc)

--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
@@ -221,3 +221,37 @@ entry:
 ; ARM32: add r0, r0, [[REG]]
 ; ARM32: and r0, r0, #-536870912 ; 0xe0000000
 ; ARM32: sub sp, sp, r0
+; Test that a simple alloca sequence doesn't trigger a frame pointer.
+define void @fixed_no_frameptr(i32 %arg) {
+entry:
+  %a1 = alloca i8, i32 8, align 4
+  %a2 = alloca i8, i32 12, align 4
+  %a3 = alloca i8, i32 16, align 4
+  %p1 = bitcast i8* %a1 to i32*
+  %p2 = bitcast i8* %a2 to i32*
+  %p3 = bitcast i8* %a3 to i32*
+  store i32 %arg, i32* %p1, align 1
+  store i32 %arg, i32* %p2, align 1
+  store i32 %arg, i32* %p3, align 1
+  ret void
+}
+; CHECK-LABEL: fixed_no_frameptr
+; CHECK-NOT:      mov     ebp,esp
+; Test that a more complex alloca sequence does trigger a frame pointer.
+define void @var_with_frameptr(i32 %arg) {
+entry:
+  %a1 = alloca i8, i32 8, align 4
+  %a2 = alloca i8, i32 12, align 4
+  %a3 = alloca i8, i32 %arg, align 4
+  %p1 = bitcast i8* %a1 to i32*
+  %p2 = bitcast i8* %a2 to i32*
+  %p3 = bitcast i8* %a3 to i32*
+  store i32 %arg, i32* %p1, align 1
+  store i32 %arg, i32* %p2, align 1
+  store i32 %arg, i32* %p3, align 1
+  ret void
+}
+; CHECK-LABEL: var_with_frameptr
+; CHECK:      mov     ebp,esp
--- a/tests_lit/llvm2ice_tests/ebp_args.ll
+++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -9,6 +9,8 @@ declare i32 @memcpy_helper2(i32 %buf, i32 %buf2, i32 %n)
 define i32 @memcpy_helper(i32 %buf, i32 %n) {
 entry:
+  br label %eblock  ; Disable alloca optimization
+eblock:
  %buf2 = alloca i8, i32 128, align 4
  %n.arg_trunc = trunc i32 %n to i8
  %arg.ext = zext i8 %n.arg_trunc to i32

--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -265,6 +265,8 @@ declare void @use_ptr(i32 %iptr)
 define i64 @test_atomic_rmw_add_64_alloca(i32 %iptr, i64 %v) {
 entry:
+  br label %eblock  ; Disable alloca optimization
+eblock:
  %alloca_ptr = alloca i8, i32 16, align 16
  %ptr = inttoptr i32 %iptr to i64*
  %old = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
@@ -805,6 +807,8 @@ entry:
 ; used to manage the stack frame, so it cannot be used as a register either.
 define i64 @test_atomic_cmpxchg_64_alloca(i32 %iptr, i64 %expected, i64 %desired) {
 entry:
+  br label %eblock  ; Disable alloca optimization
+eblock:
  %alloca_ptr = alloca i8, i32 16, align 16
  %ptr = inttoptr i32 %iptr to i64*
  %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,