Fixes ARM32 VFP calling convention.

Packs VFP arguments as tightly as the ABI requires, and adds tests for float and double arguments. Vector argument tests will come soon. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1348393002 .

Fixes ARM32 VFP calling convention.
385351ba · John Porto · 52863b13 · 385351ba · 385351ba · 385351ba
Commit 385351ba authored Sep 16, 2015 by John Porto
5 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -538,26 +538,31 @@ void InstARM32Vmov::emitSingleDestMultiSource(const Cfg *Func) const {
  Src1->emit(Func);
 }

+namespace {
+bool isVariableWithoutRegister(const Operand *Op) {
+  if (const auto *OpV = llvm::dyn_cast<const Variable>(Op)) {
+    return !OpV->hasReg();
+  }
+  return false;
+}
+
+bool isMemoryAccess(Operand *Op) {
+  return isVariableWithoutRegister(Op) || llvm::isa<OperandARM32Mem>(Op);
+}
+} // end of anonymous namespace
+
 void InstARM32Vmov::emitSingleDestSingleSource(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Func->getContext()->getStrEmit();
  Variable *Dest = getDest();
  if (Dest->hasReg()) {
-    IceString ActualOpcode = "vmov";
    Operand *Src0 = getSrc(0);
-    if (const auto *Src0V = llvm::dyn_cast<Variable>(Src0)) {
-      if (!Src0V->hasReg()) {
-        ActualOpcode = IceString("vldr");
-      }
-    } else {
-      if (llvm::isa<OperandARM32Mem>(Src0))
-        ActualOpcode = IceString("vldr");
-    }
+    const char *ActualOpcode = isMemoryAccess(Src0) ? "vldr" : "vmov";
    Str << "\t" << ActualOpcode << "\t";
-    getDest()->emit(Func);
+    Dest->emit(Func);
    Str << ", ";
-    getSrc(0)->emit(Func);
+    Src0->emit(Func);
  } else {
    Variable *Src0 = llvm::cast<Variable>(getSrc(0));
    assert(Src0->hasReg());
@@ -897,8 +902,8 @@ void InstARM32Str::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 2);
  Type Ty = getSrc(0)->getType();
-  Str << "\t"
-      << "str" << getWidthString(Ty) << getPredicate() << "\t";
+  const char *Opcode = isScalarFloatingType(Ty) ? "vstr" : "str";
+  Str << "\t" << Opcode << getWidthString(Ty) << getPredicate() << "\t";
  getSrc(0)->emit(Func);
  Str << ", ";
  getSrc(1)->emit(Func);

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -12,7 +12,6 @@
 /// entirely of the lowering sequence for each high-level instruction.
 ///
 //===----------------------------------------------------------------------===//
-
 #include "IceTargetLoweringARM32.h"

 #include "IceCfg.h"
@@ -465,39 +464,50 @@ bool TargetARM32::CallingConv::I32InReg(int32_t *Reg) {
 }

 bool TargetARM32::CallingConv::FPInReg(Type Ty, int32_t *Reg) {
-  if (NumFPRegUnits >= ARM32_MAX_FP_REG_UNITS)
+  if (!VFPRegsFree.any()) {
    return false;
+  }
+
  if (isVectorType(Ty)) {
-    NumFPRegUnits = Utils::applyAlignment(NumFPRegUnits, 4);
    // Q registers are declared in reverse order, so RegARM32::Reg_q0 >
-    // RegARM32::Reg_q1. Therefore, we need to subtract NumFPRegUnits from
-    // Reg_q0. Same thing goes for D registers.
+    // RegARM32::Reg_q1. Therefore, we need to subtract QRegStart from Reg_q0.
+    // Same thing goes for D registers.
    static_assert(RegARM32::Reg_q0 > RegARM32::Reg_q1,
                  "ARM32 Q registers are possibly declared incorrectly.");
-    *Reg = RegARM32::Reg_q0 - (NumFPRegUnits / 4);
-    NumFPRegUnits += 4;
-    // If this bumps us past the boundary, don't allocate to a register and
-    // leave any previously speculatively consumed registers as consumed.
-    if (NumFPRegUnits > ARM32_MAX_FP_REG_UNITS)
-      return false;
+
+    int32_t QRegStart = (VFPRegsFree & ValidV128Regs).find_first();
+    if (QRegStart >= 0) {
+      VFPRegsFree.reset(QRegStart, QRegStart + 4);
+      *Reg = RegARM32::Reg_q0 - (QRegStart / 4);
+      return true;
+    }
  } else if (Ty == IceType_f64) {
    static_assert(RegARM32::Reg_d0 > RegARM32::Reg_d1,
                  "ARM32 D registers are possibly declared incorrectly.");
-    NumFPRegUnits = Utils::applyAlignment(NumFPRegUnits, 2);
-    *Reg = RegARM32::Reg_d0 - (NumFPRegUnits / 2);
-    NumFPRegUnits += 2;
-    // If this bumps us past the boundary, don't allocate to a register and
-    // leave any previously speculatively consumed registers as consumed.
-    if (NumFPRegUnits > ARM32_MAX_FP_REG_UNITS)
-      return false;
+
+    int32_t DRegStart = (VFPRegsFree & ValidF64Regs).find_first();
+    if (DRegStart >= 0) {
+      VFPRegsFree.reset(DRegStart, DRegStart + 2);
+      *Reg = RegARM32::Reg_d0 - (DRegStart / 2);
+      return true;
+    }
  } else {
    static_assert(RegARM32::Reg_s0 < RegARM32::Reg_s1,
                  "ARM32 S registers are possibly declared incorrectly.");
+
    assert(Ty == IceType_f32);
-    *Reg = RegARM32::Reg_s0 + NumFPRegUnits;
-    ++NumFPRegUnits;
+    int32_t SReg = VFPRegsFree.find_first();
+    assert(SReg >= 0);
+    VFPRegsFree.reset(SReg);
+    *Reg = RegARM32::Reg_s0 + SReg;
+    return true;
  }
-  return true;
+
+  // Parameter allocation failed. From now on, every fp register must be placed
+  // on the stack. We clear VFPRegsFree in case there are any "holes" from S
+  // and D registers.
+  VFPRegsFree.clear();
+  return false;
 }

 void TargetARM32::lowerArguments() {
@@ -2235,6 +2245,8 @@ void TargetARM32::lowerCast(const InstCast *Inst) {
      UnimplementedError(Func->getContext()->getFlags());
      break;
    case IceType_v4i32:
+      // avoid cryptic liveness errors
+      Context.insert(InstFakeDef::create(Func, Dest));
      UnimplementedError(Func->getContext()->getFlags());
      break;
    case IceType_v4f32:
@@ -2768,9 +2780,10 @@ void TargetARM32::lowerStore(const InstStore *Inst) {
    Variable *ValueLo = legalizeToReg(loOperand(Value));
    _str(ValueHi, llvm::cast<OperandARM32Mem>(hiOperand(NewAddr)));
    _str(ValueLo, llvm::cast<OperandARM32Mem>(loOperand(NewAddr)));
-  } else if (isVectorType(Ty)) {
-    UnimplementedError(Func->getContext()->getFlags());
  } else {
+    if (isVectorType(Ty)) {
+      UnimplementedError(Func->getContext()->getFlags());
+    }
    Variable *ValueR = legalizeToReg(Value);
    _str(ValueR, NewAddr);
  }
@@ -2832,7 +2845,10 @@ Variable *TargetARM32::makeVectorOfZeros(Type Ty, int32_t RegNum) {
 Variable *TargetARM32::copyToReg(Operand *Src, int32_t RegNum) {
  Type Ty = Src->getType();
  Variable *Reg = makeReg(Ty, RegNum);
-  if (isVectorType(Ty) || isFloatingType(Ty)) {
+  if (isVectorType(Ty)) {
+    // TODO(jpp): Src must be a register, or an address with base register.
+    _vmov(Reg, Src);
+  } else if (isFloatingType(Ty)) {
    _vmov(Reg, Src);
  } else {
    // Mov's Src operand can really only be the flexible second operand type or

--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -21,6 +21,8 @@
 #include "IceRegistersARM32.h"
 #include "IceTargetLowering.h"

+#include "llvm/ADT/SmallBitVector.h"
+
 namespace Ice {

 // Class encapsulating ARM cpu features / instruction set.
@@ -461,19 +463,34 @@ protected:
  /// Helper class that understands the Calling Convention and register
  /// assignments. The first few integer type parameters can use r0-r3,
  /// regardless of their position relative to the floating-point/vector
-  /// arguments in the argument list. Floating-point and vector arguments can
-  /// use q0-q3 (aka d0-d7, s0-s15). Technically, arguments that can start with
-  /// registers but extend beyond the available registers can be split between
-  /// the registers and the stack. However, this is typically for passing GPR
-  /// structs by value, and PNaCl transforms expand this out.
+  /// arguments in the argument list. Floating-point and vector arguments
+  /// can use q0-q3 (aka d0-d7, s0-s15). For more information on the topic,
+  /// see the ARM Architecture Procedure Calling Standards (AAPCS).
+  ///
+  /// Technically, arguments that can start with registers but extend beyond the
+  /// available registers can be split between the registers and the stack.
+  /// However, this is typically for passing GPR structs by value, and PNaCl
+  /// transforms expand this out.
  ///
-  /// Also, at the point before the call, the stack must be aligned.
+  /// At (public) function entry, the stack must be 8-byte aligned.
  class CallingConv {
    CallingConv(const CallingConv &) = delete;
    CallingConv &operator=(const CallingConv &) = delete;

  public:
-    CallingConv() {}
+    CallingConv()
+        : VFPRegsFree(ARM32_MAX_FP_REG_UNITS, true),
+          ValidF64Regs(ARM32_MAX_FP_REG_UNITS),
+          ValidV128Regs(ARM32_MAX_FP_REG_UNITS) {
+      for (uint32_t i = 0; i < ARM32_MAX_FP_REG_UNITS; ++i) {
+        if ((i % 2) == 0) {
+          ValidF64Regs[i] = true;
+        }
+        if ((i % 4) == 0) {
+          ValidV128Regs[i] = true;
+        }
+      }
+    }
    ~CallingConv() = default;

    bool I64InRegs(std::pair<int32_t, int32_t> *Regs);
@@ -481,12 +498,14 @@ protected:
    bool FPInReg(Type Ty, int32_t *Reg);

    static constexpr uint32_t ARM32_MAX_GPR_ARG = 4;
-    // Units of S registers still available to S/D/Q arguments.
+    // TODO(jpp): comment.
    static constexpr uint32_t ARM32_MAX_FP_REG_UNITS = 16;

  private:
    uint32_t NumGPRRegsUsed = 0;
-    uint32_t NumFPRegUnits = 0;
+    llvm::SmallBitVector VFPRegsFree;
+    llvm::SmallBitVector ValidF64Regs;
+    llvm::SmallBitVector ValidV128Regs;
  };

 private:

--- a/tests_lit/llvm2ice_tests/fp.arm.call.ll
+++ b/tests_lit/llvm2ice_tests/fp.arm.call.ll
--- a/tests_lit/llvm2ice_tests/phi_invalid.test
+++ b/tests_lit/llvm2ice_tests/phi_invalid.test
@@ -2,6 +2,7 @@
 ; https://code.google.com/p/nativeclient/issues/detail?id=4304

 RUN: %p2i --expect-fail --tbc -i %p/Input/phi-invalid.tbc --insts 2>&1 \
+RUN:        --filetype=obj --args -o /dev/null \
 RUN:        | FileCheck --check-prefix=BADPHI %s

 ; BADPHI: Phi error: