Commit 55500dbc by Jan Voung

Subzero ARM: lower alloca instruction.

Lower alloca in a way similar to x86. Subtract the stack and align if needed, then copy that stack address to dest. Sometimes use "bic" for the mask, sometimes use "and", depending on what fits better. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1156713003
parent 3bfd99a3
......@@ -290,6 +290,7 @@ template <> const char *InstARM32Mov::Opcode = "mov";
template <> const char *InstARM32Adc::Opcode = "adc";
template <> const char *InstARM32Add::Opcode = "add";
template <> const char *InstARM32And::Opcode = "and";
template <> const char *InstARM32Bic::Opcode = "bic";
template <> const char *InstARM32Eor::Opcode = "eor";
template <> const char *InstARM32Lsl::Opcode = "lsl";
template <> const char *InstARM32Mul::Opcode = "mul";
......
......@@ -252,6 +252,7 @@ public:
Adc,
Add,
And,
Bic,
Br,
Call,
Cmp,
......@@ -510,6 +511,7 @@ private:
typedef InstARM32ThreeAddrGPR<InstARM32::Adc> InstARM32Adc;
typedef InstARM32ThreeAddrGPR<InstARM32::Add> InstARM32Add;
typedef InstARM32ThreeAddrGPR<InstARM32::And> InstARM32And;
typedef InstARM32ThreeAddrGPR<InstARM32::Bic> InstARM32Bic;
typedef InstARM32ThreeAddrGPR<InstARM32::Eor> InstARM32Eor;
typedef InstARM32ThreeAddrGPR<InstARM32::Lsl> InstARM32Lsl;
typedef InstARM32ThreeAddrGPR<InstARM32::Mul> InstARM32Mul;
......
......@@ -123,6 +123,9 @@ ICEINSTICMP_TABLE
// The maximum number of arguments to pass in GPR registers.
const uint32_t ARM32_MAX_GPR_ARG = 4;
// Stack alignment
const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
} // end of anonymous namespace
TargetARM32::TargetARM32(Cfg *Func)
......@@ -607,8 +610,42 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
// stack alignment is preserved after the alloca. The stack alignment
// restriction can be relaxed in some cases.
NeedsStackAlignment = true;
(void)Inst;
UnimplementedError(Func->getContext()->getFlags());
// TODO(stichnot): minimize the number of adjustments of SP, etc.
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
Variable *Dest = Inst->getDest();
uint32_t AlignmentParam = Inst->getAlignInBytes();
// For default align=0, set it to the real value 1, to avoid any
// bit-manipulation problems below.
AlignmentParam = std::max(AlignmentParam, 1u);
// LLVM enforces power of 2 alignment.
assert(llvm::isPowerOf2_32(AlignmentParam));
assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));
uint32_t Alignment = std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
if (Alignment > ARM32_STACK_ALIGNMENT_BYTES) {
alignRegisterPow2(SP, Alignment);
}
Operand *TotalSize = Inst->getSizeInBytes();
if (const auto *ConstantTotalSize =
llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
uint32_t Value = ConstantTotalSize->getValue();
Value = Utils::applyAlignment(Value, Alignment);
Operand *SubAmount = legalize(Ctx->getConstantInt32(Value));
_sub(SP, SP, SubAmount);
} else {
// Non-constant sizes need to be adjusted to the next highest
// multiple of the required alignment at runtime.
TotalSize = legalize(TotalSize);
Variable *T = makeReg(IceType_i32);
_mov(T, TotalSize);
Operand *AddAmount = legalize(Ctx->getConstantInt32(Alignment - 1));
_add(T, T, AddAmount);
alignRegisterPow2(T, Alignment);
_sub(SP, SP, T);
}
_mov(Dest, SP);
}
void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
......@@ -1528,6 +1565,23 @@ Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {
return Reg;
}
// Round Reg down to the nearest multiple of Align (a power of 2) by clearing
// its low bits, choosing whichever of BIC/AND can encode the mask as an
// ARM flexible immediate.
void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
  assert(llvm::isPowerOf2_32(Align));
  const uint32_t LowBits = Align - 1;
  uint32_t RotateAmt = 0;
  uint32_t Immed_8;
  // Prefer BIC with the (Align - 1) immediate when it is encodable as a
  // rotated 8-bit value -- the usual case for small Align. Otherwise fall
  // back to AND with -Align; legalize() materializes the constant in a
  // register if the flex-immediate form does not fit.
  if (OperandARM32FlexImm::canHoldImm(LowBits, &RotateAmt, &Immed_8)) {
    Operand *BicMask =
        legalize(Ctx->getConstantInt32(LowBits), Legal_Reg | Legal_Flex);
    _bic(Reg, Reg, BicMask);
  } else {
    Operand *AndMask =
        legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex);
    _and(Reg, Reg, AndMask);
  }
}
void TargetARM32::postLower() {
if (Ctx->getFlags().getOptLevel() == Opt_m1)
return;
......
......@@ -118,6 +118,7 @@ protected:
Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
static Type stackSlotType();
Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
void alignRegisterPow2(Variable *Reg, uint32_t Align);
// Returns a vector in a register with the given constant entries.
Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
......@@ -148,6 +149,10 @@ protected:
CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32And::create(Func, Dest, Src0, Src1, Pred));
}
// Emit an ARM "bic" (bit-clear: Dest = Src0 & ~Src1), optionally predicated
// (defaults to AL, i.e. unconditional), by appending the instruction to the
// current lowering context.
void _bic(Variable *Dest, Variable *Src0, Operand *Src1,
CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Bic::create(Func, Dest, Src0, Src1, Pred));
}
void _br(CondARM32::Cond Condition, CfgNode *TargetTrue,
CfgNode *TargetFalse) {
Context.insert(
......
......@@ -138,18 +138,10 @@ const uint32_t X86_LOG2_OF_MAX_STACK_SLOT_SIZE = 4;
// The number of different NOP instructions
const uint32_t X86_NUM_NOP_VARIANTS = 5;
// Value and Alignment are in bytes. Return Value rounded up to the next
// highest multiple of Alignment (which must be a power of 2).
uint32_t applyAlignment(uint32_t Value, uint32_t Alignment) {
  // A power of two has exactly one bit set.
  assert((Alignment & (Alignment - 1)) == 0);
  const uint32_t Mask = Alignment - 1;
  // ~Mask == -Alignment for a power-of-2 Alignment.
  return (Value + Mask) & ~Mask;
}
// Value is in bytes. Return Value adjusted to the next highest multiple
// of the stack alignment.
uint32_t applyStackAlignment(uint32_t Value) {
  // Delegate to the shared Utils helper with the fixed x86 stack alignment,
  // consistent with the other call sites in this file. (The pre-refactor
  // call to the file-local applyAlignment was an unreachable duplicate
  // return left over from the diff and has been removed.)
  return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
}
// In some cases, there are x-macros tables for both high-level and
......@@ -957,7 +949,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES);
uint32_t PaddingStart = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
uint32_t SpillAreaStart =
applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
Utils::applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
SpillAreaPaddingBytes = SpillAreaStart - PaddingStart;
SpillAreaSizeBytes += SpillAreaPaddingBytes;
}
......@@ -968,7 +960,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
if (LocalsSlotsAlignmentBytes) {
assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
GlobalsAndSubsequentPaddingSize =
applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
Utils::applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
SpillAreaSizeBytes += GlobalsAndSubsequentPaddingSize - GlobalsSize;
}
......@@ -1261,7 +1253,7 @@ void TargetX8632::lowerAlloca(const InstAlloca *Inst) {
// restriction can be relaxed in some cases.
NeedsStackAlignment = true;
// TODO(sehr,stichnot): minimize the number of adjustments of esp, etc.
// TODO(stichnot): minimize the number of adjustments of esp, etc.
Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
Operand *TotalSize = legalize(Inst->getSizeInBytes());
Variable *Dest = Inst->getDest();
......@@ -1271,17 +1263,17 @@ void TargetX8632::lowerAlloca(const InstAlloca *Inst) {
AlignmentParam = std::max(AlignmentParam, 1u);
// LLVM enforces power of 2 alignment.
assert((AlignmentParam & (AlignmentParam - 1)) == 0);
assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0);
assert(llvm::isPowerOf2_32(AlignmentParam));
assert(llvm::isPowerOf2_32(X86_STACK_ALIGNMENT_BYTES));
uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES);
if (Alignment > X86_STACK_ALIGNMENT_BYTES) {
_and(esp, Ctx->getConstantInt32(-Alignment));
}
if (ConstantInteger32 *ConstantTotalSize =
if (const auto *ConstantTotalSize =
llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
uint32_t Value = ConstantTotalSize->getValue();
Value = applyAlignment(Value, Alignment);
Value = Utils::applyAlignment(Value, Alignment);
_sub(esp, Ctx->getConstantInt32(Value));
} else {
// Non-constant sizes need to be adjusted to the next highest
......
......@@ -61,16 +61,27 @@ public:
return IsUint(N, Value);
}
// Return true if the addition X + Y would cause integer overflow for
// integers of type T.
template <typename T> static inline bool WouldOverflowAdd(T X, T Y) {
  if (X > 0 && Y > 0)
    return X > std::numeric_limits<T>::max() - Y;
  if (X < 0 && Y < 0)
    return X < std::numeric_limits<T>::min() - Y;
  // Operands of mixed sign (or a zero operand) can never overflow.
  return false;
}
// Return true if X is already aligned by N, where N is a power of 2.
template <typename T> static inline bool IsAligned(T X, intptr_t N) {
assert(llvm::isPowerOf2_64(N));
// For a power-of-2 N, (N - 1) masks the low bits that must all be zero
// for X to be a multiple of N.
return (X & (N - 1)) == 0;
}
// Return Value rounded up to the next highest multiple of Alignment,
// which must be a power of 2.
static inline uint32_t applyAlignment(uint32_t Value, uint32_t Alignment) {
  assert(llvm::isPowerOf2_32(Alignment));
  const uint32_t LowMask = Alignment - 1;
  // ~LowMask == -Alignment when Alignment is a power of 2.
  return (Value + LowMask) & ~LowMask;
}
// Return amount which must be added to adjust Pos to the next highest
// multiple of Align.
static inline uint64_t OffsetToAlignment(uint64_t Pos, uint64_t Align) {
assert(llvm::isPowerOf2_64(Align));
uint64_t Mod = Pos & (Align - 1);
......@@ -79,6 +90,7 @@ public:
return Align - Mod;
}
// Rotate the value bit pattern to the left by shift bits.
// Precondition: 0 <= shift < 32
static inline uint32_t rotateLeft32(uint32_t value, uint32_t shift) {
if (shift == 0)
......@@ -86,6 +98,7 @@ public:
return (value << shift) | (value >> (32 - shift));
}
// Rotate the value bit pattern to the right by shift bits.
static inline uint32_t rotateRight32(uint32_t value, uint32_t shift) {
if (shift == 0)
return value;
......
; This is a basic test of the alloca instruction.
; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
; RUN: --target x8632 -i %s --args -O2 \
; RUN: | %if --need=target_X8632 --command FileCheck %s
; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
; RUN: --target x8632 -i %s --args -Om1 \
; RUN: | %if --need=target_X8632 --command FileCheck %s
; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
; once enough infrastructure is in. Also, switch to --filetype=obj
; when possible.
; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
; RUN: --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
; RUN: | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s
define void @fixed_416_align_16(i32 %n) {
entry:
......@@ -16,6 +28,10 @@ entry:
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f1
; ARM32-LABEL: fixed_416_align_16
; ARM32: sub sp, sp, #416
; ARM32: bl {{.*}} R_{{.*}} f1
define void @fixed_416_align_32(i32 %n) {
entry:
%array = alloca i8, i32 400, align 32
......@@ -30,6 +46,12 @@ entry:
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f1
; ARM32-LABEL: fixed_416_align_32
; ARM32: bic sp, sp, #31
; ARM32: sub sp, sp, #416
; ARM32: bl {{.*}} R_{{.*}} f1
; Show that the amount to allocate will be rounded up.
define void @fixed_351_align_16(i32 %n) {
entry:
%array = alloca i8, i32 351, align 16
......@@ -43,6 +65,10 @@ entry:
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f1
; ARM32-LABEL: fixed_351_align_16
; ARM32: sub sp, sp, #352
; ARM32: bl {{.*}} R_{{.*}} f1
define void @fixed_351_align_32(i32 %n) {
entry:
%array = alloca i8, i32 351, align 32
......@@ -57,8 +83,15 @@ entry:
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f1
; ARM32-LABEL: fixed_351_align_32
; ARM32: bic sp, sp, #31
; ARM32: sub sp, sp, #352
; ARM32: bl {{.*}} R_{{.*}} f1
declare void @f1(i32 %ignored)
declare void @f2(i32 %ignored)
define void @variable_n_align_16(i32 %n) {
entry:
%array = alloca i8, i32 %n, align 16
......@@ -75,6 +108,12 @@ entry:
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f2
; ARM32-LABEL: variable_n_align_16
; ARM32: add r0, r0, #15
; ARM32: bic r0, r0, #15
; ARM32: sub sp, sp, r0
; ARM32: bl {{.*}} R_{{.*}} f2
define void @variable_n_align_32(i32 %n) {
entry:
%array = alloca i8, i32 %n, align 32
......@@ -93,6 +132,13 @@ entry:
; CHECK: mov DWORD PTR [esp],eax
; CHECK: call {{.*}} R_{{.*}} f2
; ARM32-LABEL: variable_n_align_32
; ARM32: bic sp, sp, #31
; ARM32: add r0, r0, #31
; ARM32: bic r0, r0, #31
; ARM32: sub sp, sp, r0
; ARM32: bl {{.*}} R_{{.*}} f2
; Test alloca with default (0) alignment.
define void @align0(i32 %n) {
entry:
......@@ -106,4 +152,56 @@ entry:
; CHECK: and [[REG]],0xfffffff0
; CHECK: sub esp,[[REG]]
declare void @f2(i32 %ignored)
\ No newline at end of file
; ARM32-LABEL: align0
; ARM32: add r0, r0, #15
; ARM32: bic r0, r0, #15
; ARM32: sub sp, sp, r0
; Test a large alignment where a mask might not fit in an immediate
; field of an instruction for some architectures.
; Allocate %n bytes with 1 MiB (2^20) alignment; the -1048576 mask for such
; a large alignment may not fit an instruction's immediate field on some
; architectures, so it must be materialized in a register (see ARM32 CHECKs).
define void @align1MB(i32 %n) {
entry:
%array = alloca i8, i32 %n, align 1048576
%__2 = ptrtoint i8* %array to i32
call void @f2(i32 %__2)
ret void
}
; CHECK-LABEL: align1MB
; CHECK: and esp,0xfff00000
; CHECK: add [[REG:.*]],0xfffff
; CHECK: and [[REG]],0xfff00000
; CHECK: sub esp,[[REG]]
; ARM32-LABEL: align1MB
; ARM32: movw [[REG:.*]], #0
; ARM32: movt [[REG]], #65520 ; 0xfff0
; ARM32: and sp, sp, [[REG]]
; ARM32: movw [[REG2:.*]], #65535 ; 0xffff
; ARM32: movt [[REG2]], #15
; ARM32: add r0, r0, [[REG2]]
; ARM32: movw [[REG3:.*]], #0
; ARM32: movt [[REG3]], #65520 ; 0xfff0
; ARM32: and r0, r0, [[REG3]]
; ARM32: sub sp, sp, r0
; Test a large alignment where a mask might still fit in an immediate
; field of an instruction for some architectures.
; Allocate %n bytes with 512 MiB (2^29) alignment; per the ARM32 CHECK lines
; below, the -536870912 mask still encodes as an immediate on ARM, unlike the
; 1 MiB case above.
define void @align512MB(i32 %n) {
entry:
%array = alloca i8, i32 %n, align 536870912
%__2 = ptrtoint i8* %array to i32
call void @f2(i32 %__2)
ret void
}
; CHECK-LABEL: align512MB
; CHECK: and esp,0xe0000000
; CHECK: add [[REG:.*]],0x1fffffff
; CHECK: and [[REG]],0xe0000000
; CHECK: sub esp,[[REG]]
; ARM32-LABEL: align512MB
; ARM32: and sp, sp, #-536870912 ; 0xe0000000
; ARM32: mvn [[REG:.*]], #-536870912 ; 0xe0000000
; ARM32: add r0, r0, [[REG]]
; ARM32: and r0, r0, #-536870912 ; 0xe0000000
; ARM32: sub sp, sp, r0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment