Add VPUSH/VPOP instructions to the ARM32 integrated assembler.

Also fixes the corresponding emit methods for vpush and vpop to match the constraint that at most 16 consecutive registers can be pushed/popped by a single instruction. BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4334 R=jpp@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/1532233002 .

Add VPUSH/VPOP instructions to the ARM32 integrated assembler.
a3c32146 · Karl Schimpf · 39f40204 · a3c32146 · a3c32146 · a3c32146
Commit a3c32146 authored Dec 18, 2015 by Karl Schimpf
6 changed files
--- a/src/DartARM32/assembler_arm.h
+++ b/src/DartARM32/assembler_arm.h
@@ -1320,7 +1320,9 @@ class Assembler : public ValueObject {
 #if 0
  // Added the following missing operations:
  //
-  // ARM32::AssemblerARM::uxt() (uxtb and uxth).
+  // ARM32::AssemblerARM32::uxt() (uxtb and uxth).
+  // ARM32::AssemblerARM32::vpop()
+  // ARM32::AssemblerARM32::vpush()
  // ARM32::AssemblerARM:rbit().
 #endif

--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -115,6 +115,9 @@ static constexpr IValueT kInstTypeDataImmediate = 1; // i.e. 001
 static constexpr IValueT kInstTypeMemImmediate = 2;  // i.e. 010
 static constexpr IValueT kInstTypeRegisterShift = 3; // i.e. 011
+// Limit on number of registers in a vpush/vpop.
+static constexpr SizeT VpushVpopMaxConsecRegs = 16;
 // Offset modifier to current PC for next instruction.  The offset is off by 8
 // due to the way the ARM CPUs read PC.
 static constexpr IOffsetT kPCReadOffset = 8;
@@ -199,6 +202,12 @@ IValueT getEncodedGPRegNum(const Variable *Var) {
                                        : RegARM32::getEncodedGPR(Reg);
 }
+// Maps the register assigned to Var through RegARM32::getEncodedSReg().
+// Asserts that Var has a register and that it lies in the S-register range.
+IValueT getEncodedSRegNum(const Variable *Var) {
+  assert(Var->hasReg());
+  const auto AssignedReg = Var->getRegNum();
+  assert(RegARM32::isEncodedSReg(AssignedReg));
+  return RegARM32::getEncodedSReg(AssignedReg);
+}
 // The way an operand is encoded into a sequence of bits in functions
 // encodeOperand and encodeAddress below.
 enum EncodedOperand {
@@ -1997,5 +2006,54 @@ void AssemblerARM32::uxt(const Operand *OpRd, const Operand *OpSrc0,
  emitSignExtend(Cond, UxtOpcode, OpRd, OpSrc0, UxtName);
 }
+// Emits a vpush/vpop-style instruction covering NumConsecRegs consecutive S
+// registers starting at OpBaseReg.
+//
+// Encoding pattern: ccccxxxxxDxxxxxxddddxxxxiiiiiiii where cccc=Cond,
+// ddddD=BaseReg, iiiiiiii=NumConsecRegs, and the x bits come from Opcode.
+//
+// Cond          - condition field of the instruction.
+// Opcode        - fixed bits of the instruction (D, dddd and iiiiiiii zero).
+// OpBaseReg     - variable holding the first S register of the list.
+// NumConsecRegs - register count; asserted to be in
+//                 [1, VpushVpopMaxConsecRegs] and to stay within the S
+//                 register file.
+// InstName      - instruction name passed to verifyCondDefined().
+void AssemblerARM32::emitVStackOp(CondARM32::Cond Cond, IValueT Opcode,
+                                  const Variable *OpBaseReg,
+                                  SizeT NumConsecRegs, const char *InstName) {
+  // Position of the D field in the instruction word (bit 22 in the pattern
+  // above).
+  constexpr IValueT DBitShift = 22;
+  const IValueT BaseReg = getEncodedSRegNum(OpBaseReg);
+  const IValueT DLastBit = mask(BaseReg, 0, 1); // Last bit of base register.
+  const IValueT Rd = mask(BaseReg, 1, 4);       // Top 4 bits of base register.
+  assert(0 < NumConsecRegs);
+  assert(NumConsecRegs <= VpushVpopMaxConsecRegs);
+  assert((BaseReg + NumConsecRegs) <= RegARM32::getNumSRegs());
+  verifyCondDefined(Cond, InstName);
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  // D must be shifted into bit 22. OR-ing it in unshifted would place it in
+  // bit 0 of the iiiiiiii register-count field whenever the base register is
+  // odd, corrupting the count and dropping the D bit.
+  const IValueT Encoding = Opcode | (Cond << kConditionShift) |
+                           (DLastBit << DBitShift) | (Rd << kRdShift) |
+                           NumConsecRegs;
+  emitInst(Encoding);
+}
+void AssemblerARM32::vpop(const Variable *OpBaseReg, SizeT NumConsecRegs,
+                          CondARM32::Cond Cond) {
+  // Only the S-register form is handled here: OpBaseReg is assumed to be
+  // assigned an S register. The D-register form is not implemented.
+  //
+  // VPOP - ARM section A8.8.367, encoding A2:
+  //   vpop<c> <RegList>
+  //
+  // cccc11001D111101dddd1010iiiiiiii where cccc=Cond, ddddD=BaseReg, and
+  // iiiiiiii=NumConsecRegs.
+  constexpr IValueT FixedBits =
+      B27 | B26 | B23 | B21 | B20 | B19 | B18 | B16 | B11 | B9;
+  constexpr const char *Mnemonic = "vpop";
+  emitVStackOp(Cond, FixedBits, OpBaseReg, NumConsecRegs, Mnemonic);
+}
+void AssemblerARM32::vpush(const Variable *OpBaseReg, SizeT NumConsecRegs,
+                           CondARM32::Cond Cond) {
+  // Only the S-register form is handled here: OpBaseReg is assumed to be
+  // assigned an S register. The D-register form is not implemented.
+  //
+  // VPUSH - ARM section A8.8.368, encoding A2:
+  //   vpush<c> <RegList>
+  //
+  // cccc11010D101101dddd1010iiiiiiii where cccc=Cond, ddddD=BaseReg, and
+  // iiiiiiii=NumConsecRegs.
+  constexpr IValueT FixedBits =
+      B27 | B26 | B24 | B21 | B19 | B18 | B16 | B11 | B9;
+  constexpr const char *Mnemonic = "vpush";
+  emitVStackOp(Cond, FixedBits, OpBaseReg, NumConsecRegs, Mnemonic);
+}
 } // end of namespace ARM32
 } // end of namespace Ice
--- a/src/IceAssemblerARM32.h
+++ b/src/IceAssemblerARM32.h
@@ -318,6 +318,12 @@ public:
  // Implements uxtb/uxth depending on type of OpSrc0.
  void uxt(const Operand *OpRd, const Operand *OpSrc0, CondARM32::Cond Cond);
+  void vpop(const Variable *OpBaseReg, SizeT NumConsecRegs,
+            CondARM32::Cond Cond);
+  void vpush(const Variable *OpBaseReg, SizeT NumConsecRegs,
+             CondARM32::Cond Cond);
  static bool classof(const Assembler *Asm) {
    return Asm->getKind() == Asm_ARM32;
  }
@@ -414,6 +420,12 @@ private:
                      bool IsLoad, IValueT BaseReg, IValueT Registers,
                      const char *InstName);
+  // Pattern ccccxxxxxDxxxxxxddddxxxxiiiiiiii where cccc=Cond, ddddD=BaseReg,
+  // iiiiiiii=NumConsecRegs, and xxxxx0xxxxxx0000xxxx00000000=Opcode.
+  void emitVStackOp(CondARM32::Cond Cond, IValueT Opcode,
+                    const Variable *OpBaseReg, SizeT NumConsecRegs,
+                    const char *InstName);
  // Pattern cccc011100x1dddd1111mmmm0001nnnn where cccc=Cond,
  // x=Opcode, dddd=Rd, nnnn=Rn, mmmm=Rm.
  void emitDivOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd, IValueT Rn,

--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -27,6 +27,9 @@ namespace Ice {
 namespace {
+// Maximum number of consecutive registers allowed in a single vpush/vpop.
+static constexpr SizeT VpushVpopMaxConsecRegs = 16;
 const struct TypeARM32Attributes_ {
  const char *WidthString;    // b, h, <blank>, or d
  const char *VecWidthString; // i8, i16, i32, f32, f64
@@ -1311,7 +1314,8 @@ template <> void InstARM32Uxt::emitIAS(const Cfg *Func) const {
 namespace {
-bool isAssignedConsecutiveRegisters(Variable *Before, Variable *After) {
+bool isAssignedConsecutiveRegisters(const Variable *Before,
+                                    const Variable *After) {
  assert(Before->hasReg());
  assert(After->hasReg());
  return Before->getRegNum() + 1 == After->getRegNum();
@@ -1380,21 +1384,30 @@ void InstARM32Pop::emit(const Cfg *Func) const {
 }
 void InstARM32Pop::emitIAS(const Cfg *Func) const {
+  // Pop can't be emitted if there are no registers to load. This should never
+  // happen, but if it does, we don't need to bring Subzero down -- we just skip
+  // emitting the pop instruction (and maybe emit a nop?) The assert() is here
+  // so that we can detect this error during development.
+  const SizeT DestSize = Dests.size();
+  if (DestSize == 0) {
+    assert(false && "Empty pop list");
+    return;
+  }
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const auto *Reg = llvm::cast<Variable>(Dests[0]);
+  if (isScalarIntegerType(Reg->getType())) {
+    // Pop GPR registers.
    SizeT IntegerCount = 0;
    ARM32::IValueT GPRegisters = 0;
    const Variable *LastDest = nullptr;
    for (const Variable *Var : Dests) {
-    if (!isScalarIntegerType(Var->getType()))
+      assert(Var->hasReg() && "pop only applies to registers");
-      // TODO(kschimpf) Implement vpush.
+      int32_t Reg = RegARM32::getEncodedGPR(Var->getRegNum());
-      return emitUsingTextFixup(Func);
-    assert((Var && Var->hasReg()) && "pop only applies to registers");
-    int32_t Reg = Var->getRegNum();
-    assert(Reg != RegARM32::Encoded_Not_GPR);
      LastDest = Var;
      GPRegisters |= (1 << Reg);
      ++IntegerCount;
    }
-  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
    switch (IntegerCount) {
    case 0:
      return;
@@ -1408,6 +1421,27 @@ void InstARM32Pop::emitIAS(const Cfg *Func) const {
      Asm->popList(GPRegisters, CondARM32::AL);
      break;
    }
+  } else {
+    // Pop vector/floating point registers.
+    const Variable *BaseReg = nullptr;
+    SizeT RegCount = 0;
+    for (const Variable *NextReg : Dests) {
+      if (BaseReg == nullptr) {
+        BaseReg = NextReg;
+        RegCount = 1;
+      } else if (RegCount < VpushVpopMaxConsecRegs &&
+                 isAssignedConsecutiveRegisters(Reg, NextReg)) {
+        ++RegCount;
+      } else {
+        Asm->vpop(BaseReg, RegCount, CondARM32::AL);
+        BaseReg = NextReg;
+        RegCount = 1;
+      }
+      Reg = NextReg;
+    }
+    if (RegCount)
+      Asm->vpop(BaseReg, RegCount, CondARM32::AL);
+  }
  if (Asm->needsTextFixup())
    emitUsingTextFixup(Func);
 }
@@ -1441,7 +1475,7 @@ void InstARM32Push::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
-  auto *Reg = llvm::cast<Variable>(getSrc(0));
+  const auto *Reg = llvm::cast<Variable>(getSrc(0));
  if (isScalarIntegerType(Reg->getType())) {
    // GPR push.
    Str << "\t"
@@ -1461,12 +1495,16 @@ void InstARM32Push::emit(const Cfg *Func) const {
         "vpush"
         "\t{";
  Reg->emit(Func);
+  SizeT RegCount = 1;
  for (SizeT i = 1; i < SrcSize; ++i) {
-    auto *NextReg = llvm::cast<Variable>(getSrc(i));
+    const auto *NextReg = llvm::cast<Variable>(getSrc(i));
-    if (isAssignedConsecutiveRegisters(Reg, NextReg)) {
+    if (RegCount < VpushVpopMaxConsecRegs &&
+        isAssignedConsecutiveRegisters(Reg, NextReg)) {
+      ++RegCount;
      Str << ", ";
    } else {
      startNextInst(Func);
+      RegCount = 1;
      Str << "}\n\t"
             "vpush"
             "\t{";
@@ -1478,22 +1516,31 @@ void InstARM32Push::emit(const Cfg *Func) const {
 }
 void InstARM32Push::emitIAS(const Cfg *Func) const {
+  // Push can't be emitted if there are no registers to save. This should never
+  // happen, but if it does, we don't need to bring Subzero down -- we just skip
+  // emitting the push instruction (and maybe emit a nop?) The assert() is here
+  // so that we can detect this error during development.
+  const SizeT SrcSize = getSrcSize();
+  if (SrcSize == 0) {
+    assert(false && "Empty push list");
+    return;
+  }
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const auto *Reg = llvm::cast<Variable>(getSrc(0));
+  if (isScalarIntegerType(Reg->getType())) {
+    // Push GPR registers.
    SizeT IntegerCount = 0;
    ARM32::IValueT GPRegisters = 0;
    const Variable *LastSrc = nullptr;
    for (SizeT Index = 0; Index < getSrcSize(); ++Index) {
-    if (!isScalarIntegerType(getSrc(Index)->getType()))
+      const auto *Var = llvm::cast<Variable>(getSrc(Index));
-      // TODO(kschimpf) Implement vpush.
+      int32_t Reg = RegARM32::getEncodedGPR(Var->getRegNum());
-      return emitUsingTextFixup(Func);
-    const auto *Var = llvm::dyn_cast<Variable>(getSrc(Index));
-    assert((Var && Var->hasReg()) && "push only applies to registers");
-    int32_t Reg = Var->getRegNum();
      assert(Reg != RegARM32::Encoded_Not_GPR);
      LastSrc = Var;
      GPRegisters |= (1 << Reg);
      ++IntegerCount;
    }
-  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
    switch (IntegerCount) {
    case 0:
      return;
@@ -1505,10 +1552,27 @@ void InstARM32Push::emitIAS(const Cfg *Func) const {
      break;
    }
    default:
-    // TODO(kschimpf) Implement pushList in assembler.
      Asm->pushList(GPRegisters, CondARM32::AL);
      break;
    }
+  } else {
+    // Push vector/floating point registers.
+    const Variable *BaseReg = Reg;
+    SizeT RegCount = 1;
+    for (SizeT i = 1; i < SrcSize; ++i) {
+      const auto *NextReg = llvm::cast<Variable>(getSrc(i));
+      if (RegCount < VpushVpopMaxConsecRegs &&
+          isAssignedConsecutiveRegisters(Reg, NextReg)) {
+        ++RegCount;
+      } else {
+        Asm->vpush(BaseReg, RegCount, CondARM32::AL);
+        BaseReg = NextReg;
+        RegCount = 1;
+      }
+      Reg = NextReg;
+    }
+    Asm->vpush(BaseReg, RegCount, CondARM32::AL);
+  }
  if (Asm->needsTextFixup())
    emitUsingTextFixup(Func);
 }

--- a/src/IceRegistersARM32.h
+++ b/src/IceRegistersARM32.h
@@ -104,6 +104,14 @@ public:
    return Reg_I64PAIR_First <= RegNum && RegNum <= Reg_I64PAIR_Last;
  }
+  // True when RegNum lies in the inclusive range
+  // [Reg_SREG_First, Reg_SREG_Last].
+  static inline bool isEncodedSReg(int32_t RegNum) {
+    if (RegNum < Reg_SREG_First)
+      return false;
+    return RegNum <= Reg_SREG_Last;
+  }
+  // Size of the inclusive range [Reg_SREG_First, Reg_SREG_Last].
+  static inline SizeT getNumSRegs() {
+    return (Reg_SREG_Last - Reg_SREG_First) + 1;
+  }
  static inline SRegister getEncodedSReg(int32_t RegNum) {
    assert(Reg_SREG_First <= RegNum);
    assert(RegNum <= Reg_SREG_Last);

--- a/tests_lit/assembler/arm32/vpush.ll
+++ b/tests_lit/assembler/arm32/vpush.ll
+; Show that we know how to translate vpush and vpop.
+; NOTE: We use -O2 because vpush/vpop only occur if optimized. Uses
+; simple call with double parameters to cause the insertion of
+; vpush/vpop.
+; REQUIRES: allow_dump
+; Compile using standalone assembler.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
+; RUN:   | FileCheck %s --check-prefix=ASM
+; Show bytes in assembled standalone code.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 | FileCheck %s --check-prefix=DIS
+; Compile using integrated assembler.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -O2 \
+; RUN:   | FileCheck %s --check-prefix=IASM
+; Show bytes in assembled integrated code.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 | FileCheck %s --check-prefix=DIS
+define internal double @testVpushVpop(double %v1, double %v2) {
+; ASM-LABEL: testVpushVpop:
+; DIS-LABEL: 00000000 <testVpushVpop>:
+; IASM-LABEL: testVpushVpop:
+entry:
+; ASM-NEXT: .LtestVpushVpop$entry:
+; IASM-NEXT: .LtestVpushVpop$entry:
+; ASM-NEXT:     vpush   {s28, s29, s30, s31}
+; DIS-NEXT:    0:       ed2dea04
+; IASM-NEXT: 	.byte 0x4
+; IASM-NEXT: 	.byte 0xea
+; IASM-NEXT: 	.byte 0x2d
+; IASM-NEXT: 	.byte 0xed
+; ASM-NEXT:     push    {lr}
+; DIS-NEXT:    4:       e52de004
+; IASM-NEXT: 	.byte 0x4
+; IASM-NEXT: 	.byte 0xe0
+; IASM-NEXT: 	.byte 0x2d
+; IASM-NEXT: 	.byte 0xe5
+; ASM-NEXT:     sub     sp, sp, #12
+; DIS-NEXT:    8:       e24dd00c
+; IASM-NEXT: 	.byte 0xc
+; IASM-NEXT: 	.byte 0xd0
+; IASM-NEXT: 	.byte 0x4d
+; IASM-NEXT: 	.byte 0xe2
+; ASM-NEXT:     vmov.f64        d15, d0
+; DIS-NEXT:    c:       eeb0fb40
+; IASM-NEXT: 	vmov.f64	d15, d0
+; ASM-NEXT:     vmov.f64        d14, d1
+; DIS-NEXT:   10:       eeb0eb41
+; IASM-NEXT: 	vmov.f64	d14, d1
+  call void @foo()
+; ASM-NEXT:     bl      foo
+; DIS-NEXT:   14:       ebfffffe
+; IASM-NEXT: 	bl	foo	@ .word ebfffffe
+  %res = fadd double %v1, %v2
+; ASM-NEXT:     vadd.f64        d15, d15, d14
+; DIS-NEXT:   18:       ee3ffb0e
+; IASM-NEXT: 	vadd.f64	d15, d15, d14
+; ASM-NEXT:     vmov.f64        d0, d15
+; DIS-NEXT:   1c:       eeb00b4f
+; IASM-NEXT: 	vmov.f64	d0, d15
+  ret double %res
+; ASM-NEXT:     add     sp, sp, #12
+; DIS-NEXT:   20:       e28dd00c
+; IASM-NEXT: 	.byte 0xc
+; IASM-NEXT: 	.byte 0xd0
+; IASM-NEXT: 	.byte 0x8d
+; IASM-NEXT: 	.byte 0xe2
+; ASM-NEXT:     pop     {lr}
+; ASM-NEXT:     # lr = def.pseudo 
+; DIS-NEXT:   24:       e49de004
+; IASM-NEXT: 	.byte 0x4
+; IASM-NEXT: 	.byte 0xe0
+; IASM-NEXT: 	.byte 0x9d
+; IASM-NEXT: 	.byte 0xe4
+; ASM-NEXT:     vpop    {s28, s29, s30, s31}
+; ASM-NEXT:     # s28 = def.pseudo 
+; ASM-NEXT:     # s29 = def.pseudo 
+; ASM-NEXT:     # s30 = def.pseudo 
+; ASM-NEXT:     # s31 = def.pseudo 
+; DIS-NEXT:   28:       ecbdea04
+; IASM-NEXT: 	.byte 0x4
+; IASM-NEXT: 	.byte 0xea
+; IASM-NEXT: 	.byte 0xbd
+; IASM-NEXT: 	.byte 0xec
+; ASM-NEXT:     bx      lr
+; DIS-NEXT:   2c:       e12fff1e
+; IASM-NEXT: 	.byte 0x1e
+; IASM-NEXT: 	.byte 0xff
+; IASM-NEXT: 	.byte 0x2f
+; IASM-NEXT: 	.byte 0xe1
+}
+define internal void @foo() {
+  ret void
+}