Commit 18cce427 by Karl Schimpf

Fix vector load/stores in the ARM assembler.

Fixes emit() methods for load/store to specify the element size (affects alignment issues). Also adds assembler methods to generate the corresponding binary forms, and updates emitIAS() to call these assembler methods. BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4334 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1663053008 .
parent 6e8d3fae
......@@ -66,9 +66,7 @@ def main():
flat_attrs += v
arch_flags = { 'x8632': [],
'x8664': [],
# ARM doesn't have an ELF writer yet, and iasm does not
# support sandboxing yet.
'arm32': ['--filetype=asm'] }
'arm32': [] }
# all_keys is only used in the help text.
all_keys = '; '.join([' '.join(targets), ' '.join(sandboxing),
' '.join(opt_levels), ' '.join(flat_attrs)])
......
......@@ -1402,11 +1402,13 @@ class Assembler : public ValueObject {
#if 0
// Added the following missing operations:
//
// ARM32::AssemblerARM32::uxt() (uxtb and uxth).
// ARM32::AssemblerARM32::uxt() (uxtb and uxth)
// ARM32::AssemblerARM32::vpop()
// ARM32::AssemblerARM32::vpush()
// ARM32::AssemblerARM:rbit().
// ARM32::AssemblerARM::veord()
// ARM32::AssemblerARM32:rbit()
// ARM32::AssemblerARM32::veord()
// ARM32::AssemblerARM32::vld1qr()
// ARM32::AssemblerARM32::vst1qr()
#endif
DISALLOW_ALLOCATION();
......
......@@ -2544,6 +2544,61 @@ void AssemblerARM32::vldrs(const Operand *OpSd, const Operand *OpAddress,
emitInst(Encoding);
}
// Emits a one-register-list NEON memory-access instruction (the shared
// encoder for VLD1/VST1, multiple single elements).
//
// Builds Opcode | 1111...D..nnnnddddttttssaammmm, where D:dddd=Dd, nnnn=Rn,
// mmmm=Rm, tttt=NumDRegs, ss=the encoding of ElmtSize, and aa=Align.
// ElmtSize is the element width in bits and must be one of {8, 16, 32, 64};
// any other value reports a fatal error naming InstName.
void AssemblerARM32::emitVMem1Op(IValueT Opcode, IValueT Dd, IValueT Rn,
                                 IValueT Rm, DRegListSize NumDRegs,
                                 size_t ElmtSize, IValueT Align,
                                 const char *InstName) {
  // The alignment field (aa) is only two bits wide.
  assert(Utils::IsAbsoluteUint(2, Align));
  IValueT EncodedElmtSize;
  switch (ElmtSize) {
  default: {
    std::string Buffer;
    llvm::raw_string_ostream StrBuf(Buffer);
    StrBuf << InstName << ": found invalid vector element size " << ElmtSize;
    llvm::report_fatal_error(StrBuf.str());
  }
  case 8:
    EncodedElmtSize = 0;
    break;
  case 16:
    EncodedElmtSize = 1;
    break;
  case 32:
    EncodedElmtSize = 2;
    break;
  case 64:
    EncodedElmtSize = 3;
    // break added for consistency with the other cases, and so a case added
    // after this one can't silently fall through.
    break;
  }
  const IValueT Encoding =
      Opcode | (encodeCondition(CondARM32::kNone) << kConditionShift) |
      (getYInRegYXXXX(Dd) << 22) | (Rn << kRnShift) |
      (getXXXXInRegYXXXX(Dd) << kRdShift) | (NumDRegs << 8) |
      (EncodedElmtSize << 6) | (Align << 4) | Rm;
  emitInst(Encoding);
}
// Emits a VLD1 (multiple single elements) loading one Q register.
// ARM section A8.8.320, encoding A1:
//   vld1.<size> <Qd>, [<Rn>]
//
// 111101000D10nnnnddd0ttttssaammmm where tttt=DRegListSize2 (a Q register is
// a pair of D registers, hence the two-register list), D:ddd0=Qd mapped to
// its base D register, nnnn=Rn, aa=0 (use default alignment), and ss is the
// encoding of ElmtSize (element width in bits).
void AssemblerARM32::vld1qr(size_t ElmtSize, const Operand *OpQd,
                            const Operand *OpAddress, const TargetInfo &TInfo) {
  constexpr const char *Vld1qr = "vld1qr";
  const IValueT Qd = encodeQRegister(OpQd, "Qd", Vld1qr);
  const IValueT Dd = mapQRegToDReg(Qd);
  IValueT Address;
  // Only a plain [<Rn>] (register base, no immediate offset) is encodable.
  if (encodeAddress(OpAddress, Address, TInfo, NoImmOffsetAddress) !=
      EncodedAsImmRegOffset)
    llvm::report_fatal_error(std::string(Vld1qr) +
                             ": malformed memory address");
  const IValueT Rn = mask(Address, kRnShift, 4);
  // Rm=0b1111 (pc) selects the "no writeback" form per the ARM ARM.
  constexpr IValueT Rm = RegARM32::Reg_pc;
  constexpr IValueT Opcode = B26 | B21;
  constexpr IValueT Align = 0; // Use default alignment.
  emitVMem1Op(Opcode, Dd, Rn, Rm, DRegListSize2, ElmtSize, Align, Vld1qr);
}
void AssemblerARM32::vmovd(const Operand *OpDd,
const OperandARM32FlexFpImm *OpFpImm,
CondARM32::Cond Cond) {
......@@ -2858,6 +2913,28 @@ void AssemblerARM32::vstrs(const Operand *OpSd, const Operand *OpAddress,
emitInst(Encoding);
}
// Emits a VST1 (multiple single elements) storing one Q register.
// ARM section A8.8.404, encoding A1:
//   vst1.<size> <Qd>, [<Rn>]
//
// 111101000D00nnnnddd0ttttssaammmm where tttt=DRegListSize2 (a Q register is
// a pair of D registers, hence the two-register list), D:ddd0=Qd mapped to
// its base D register, nnnn=Rn, aa=0 (use default alignment), and ss is the
// encoding of ElmtSize (element width in bits).
void AssemblerARM32::vst1qr(size_t ElmtSize, const Operand *OpQd,
                            const Operand *OpAddress, const TargetInfo &TInfo) {
  constexpr const char *Vst1qr = "vst1qr";
  const IValueT Qd = encodeQRegister(OpQd, "Qd", Vst1qr);
  const IValueT Dd = mapQRegToDReg(Qd);
  IValueT Address;
  // Only a plain [<Rn>] (register base, no immediate offset) is encodable.
  if (encodeAddress(OpAddress, Address, TInfo, NoImmOffsetAddress) !=
      EncodedAsImmRegOffset)
    llvm::report_fatal_error(std::string(Vst1qr) +
                             ": malformed memory address");
  const IValueT Rn = mask(Address, kRnShift, 4);
  // Rm=0b1111 (pc) selects the "no writeback" form per the ARM ARM.
  constexpr IValueT Rm = RegARM32::Reg_pc;
  constexpr IValueT Opcode = B26;
  constexpr IValueT Align = 0; // Use default alignment.
  emitVMem1Op(Opcode, Dd, Rn, Rm, DRegListSize2, ElmtSize, Align, Vst1qr);
}
void AssemblerARM32::vsubs(const Operand *OpSd, const Operand *OpSn,
const Operand *OpSm, CondARM32::Cond Cond) {
// VSUB (floating-point) - ARM section A8.8.415, encoding A2:
......
......@@ -73,6 +73,14 @@ public:
kRotate24 // ror #24
};
// Encoding of the number of D registers in a list of D registers (the "type"
// field of VLD1/VST1 multiple-single-element instructions; see ARM sections
// A8.8.320 and A8.8.404).
enum DRegListSize {
  DRegListSize1 = 7,             // 0b0111 - one D register
  DRegListSize2 = 10,            // 0b1010 - two D registers (one Q register)
  DRegListSize3 = 6,             // 0b0110 - three D registers
  DRegListSIze3 = DRegListSize3, // Deprecated misspelling; kept so existing
                                 // uses of the old name still compile.
  DRegListSize4 = 2              // 0b0010 - four D registers
};
class TargetInfo {
TargetInfo(const TargetInfo &) = delete;
TargetInfo &operator=(const TargetInfo &) = delete;
......@@ -399,6 +407,16 @@ public:
vldrs(OpSd, OpAddress, Cond, TInfo);
}
// ElmtSize = #bits in vector element.
void vld1qr(size_t ElmtSize, const Operand *OpQd, const Operand *OpRn,
const TargetInfo &TInfo);
// Convenience overload: wraps the TargetLowering in a TargetInfo and
// forwards to the TargetInfo-based vld1qr above.
void vld1qr(size_t ElmtSize, const Operand *OpQd, const Operand *OpRn,
            const TargetLowering *Lowering) {
  const TargetInfo TInfo(Lowering);
  vld1qr(ElmtSize, OpQd, OpRn, TInfo);
}
void vmovd(const Operand *OpDn, const OperandARM32FlexFpImm *OpFpImm,
CondARM32::Cond Cond);
......@@ -477,6 +495,16 @@ public:
vstrs(OpSd, OpAddress, Cond, TInfo);
}
// ElmtSize = #bits in vector element.
void vst1qr(size_t ElmtSize, const Operand *OpQd, const Operand *OpAddress,
const TargetInfo &TInfo);
// Convenience overload: wraps the TargetLowering in a TargetInfo and
// forwards to the TargetInfo-based vst1qr above.
void vst1qr(size_t ElmtSize, const Operand *OpQd, const Operand *OpRn,
            const TargetLowering *Lowering) {
  const TargetInfo TInfo(Lowering);
  vst1qr(ElmtSize, OpQd, OpRn, TInfo);
}
void vsubd(const Operand *OpDd, const Operand *OpDn, const Operand *OpDm,
CondARM32::Cond Cond);
......@@ -601,6 +629,13 @@ private:
// mmmmM=Sm, and xx0xxxxxxdddd000xxx0x0000=Opcode.
void emitVFPds(CondARM32::Cond Cond, IValueT Opcode, IValueT Dd, IValueT Sm);
// Pattern 111100000D00nnnnddddttttssaammmm | Opcode where Ddddd=Dd, nnnn=Rn,
// mmmmm=Rm, tttt=NumDRegs, ElmtSize in {8, 16, 32, 64} and defines ss, and
// aa=Align.
void emitVMem1Op(IValueT Opcode, IValueT Dd, IValueT Rn, IValueT Rm,
DRegListSize NumDRegs, size_t ElmtSize, IValueT Align,
const char *InstName);
// Pattern cccc011100x1dddd1111mmmm0001nnn where cccc=Cond,
// x=Opcode, dddd=Rd, nnnn=Rn, mmmm=Rm.
void emitDivOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd, IValueT Rn,
......
......@@ -65,6 +65,10 @@ const struct InstARM32CondAttributes_ {
#undef X
};
// Returns the width, in bits, of a single element of vector type Ty.
size_t getVecElmtBitsize(Type Ty) {
  const Type ElmtTy = typeElementType(Ty);
  return CHAR_BIT * typeWidthInBytes(ElmtTy);
}
} // end of anonymous namespace
const char *InstARM32::getWidthString(Type Ty) {
......@@ -1563,7 +1567,6 @@ template <> void InstARM32Ldr::emit(const Cfg *Func) const {
const bool IsScalarFloat = isScalarFloatingType(Ty);
const char *ActualOpcode =
IsVector ? "vld1" : (IsScalarFloat ? "vldr" : "ldr");
const char *VectorMarker = IsVector ? ".64" : "";
const char *WidthString = IsVector ? "" : getWidthString(Ty);
Str << "\t" << ActualOpcode;
const bool IsVInst = IsVector || IsScalarFloat;
......@@ -1572,7 +1575,9 @@ template <> void InstARM32Ldr::emit(const Cfg *Func) const {
} else {
Str << WidthString << getPredicate();
}
Str << VectorMarker << "\t";
if (IsVector)
Str << "." << getVecElmtBitsize(Ty);
Str << "\t";
getDest()->emit(Func);
Str << ", ";
getSrc(0)->emit(Func);
......@@ -1580,29 +1585,32 @@ template <> void InstARM32Ldr::emit(const Cfg *Func) const {
// Integrated-assembler emission for Ldr. The scraped diff fused the removed
// and added hunks (duplicate `auto *Asm` declaration, the old if/else chain
// spliced onto the new switch); this is the coherent post-change version:
// dispatch on the destination type, using vld1qr for 128-bit vector loads.
template <> void InstARM32Ldr::emitIAS(const Cfg *Func) const {
  assert(getSrcSize() == 1);
  Variable *Dest = getDest();
  const Type DestTy = Dest->getType();
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  switch (DestTy) {
  default:
    llvm::report_fatal_error("Ldr on unknown type: " + typeIceString(DestTy));
  case IceType_i1:
  case IceType_i8:
  case IceType_i16:
  case IceType_i32:
  case IceType_i64:
    Asm->ldr(Dest, getSrc(0), getPredicate(), Func->getTarget());
    // The assembler may decline to encode this form; fall back to textual
    // emission in that case.
    if (Asm->needsTextFixup())
      emitUsingTextFixup(Func);
    break;
  case IceType_f32:
    Asm->vldrs(Dest, getSrc(0), getPredicate(), Func->getTarget());
    break;
  case IceType_f64:
    Asm->vldrd(Dest, getSrc(0), getPredicate(), Func->getTarget());
    break;
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vld1qr(getVecElmtBitsize(DestTy), Dest, getSrc(0), Func->getTarget());
    break;
  }
}
template <> void InstARM32Ldrex::emit(const Cfg *Func) const {
......@@ -1898,7 +1906,6 @@ void InstARM32Str::emit(const Cfg *Func) const {
const bool IsScalarFloat = isScalarFloatingType(Ty);
const char *Opcode =
IsVectorStore ? "vst1" : (IsScalarFloat ? "vstr" : "str");
const char *VecEltWidthString = IsVectorStore ? ".64" : "";
Str << "\t" << Opcode;
const bool IsVInst = IsVectorStore || IsScalarFloat;
if (IsVInst) {
......@@ -1906,7 +1913,9 @@ void InstARM32Str::emit(const Cfg *Func) const {
} else {
Str << getWidthString(Ty) << getPredicate();
}
Str << VecEltWidthString << "\t";
if (IsVectorStore)
Str << "." << getVecElmtBitsize(Ty);
Str << "\t";
getSrc(0)->emit(Func);
Str << ", ";
getSrc(1)->emit(Func);
......@@ -1914,28 +1923,33 @@ void InstARM32Str::emit(const Cfg *Func) const {
// Integrated-assembler emission for Str. The scraped diff fused the removed
// and added hunks (duplicate `Type Ty` declaration, the old if/else chain
// spliced onto the new switch); this is the coherent post-change version:
// dispatch on the source type, using vst1qr for 128-bit vector stores.
void InstARM32Str::emitIAS(const Cfg *Func) const {
  assert(getSrcSize() == 2);
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Operand *Src0 = getSrc(0); // value to store
  const Operand *Src1 = getSrc(1); // memory address
  Type Ty = Src0->getType();
  switch (Ty) {
  default:
    llvm::report_fatal_error("Str on unknown type: " + typeIceString(Ty));
  case IceType_i1:
  case IceType_i8:
  case IceType_i16:
  case IceType_i32:
  case IceType_i64:
    Asm->str(Src0, Src1, getPredicate(), Func->getTarget());
    break;
  case IceType_f32:
    Asm->vstrs(Src0, Src1, getPredicate(), Func->getTarget());
    break;
  case IceType_f64:
    Asm->vstrd(Src0, Src1, getPredicate(), Func->getTarget());
    break;
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vst1qr(getVecElmtBitsize(Ty), Src0, Src1, Func->getTarget());
    break;
  }
}
void InstARM32Str::dump(const Cfg *Func) const {
......
; Show that we know how to translate vector load instructions.
; Note: Uses -O2 to remove unnecessary loads/stores, resulting in only one VLD1
; instruction per function.
; REQUIRES: allow_dump
; Compile using standalone assembler.
; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
; RUN: -reg-use=q11,r5 \
; RUN: | FileCheck %s --check-prefix=ASM
; Show bytes in assembled standalone code.
; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
; RUN: --args -O2 \
; RUN: -reg-use=q11,r5 \
; RUN: | FileCheck %s --check-prefix=DIS
; Compile using integrated assembler.
; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -O2 \
; RUN: -reg-use=q11,r5 \
; RUN: | FileCheck %s --check-prefix=IASM
; Show bytes in assembled integrated code.
; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
; RUN: --args -O2 \
; RUN: -reg-use=q11,r5 \
; RUN: | FileCheck %s --check-prefix=DIS
define internal <4 x float> @testDerefFloat4(<4 x float> *%p) {
; ASM-LABEL: testDerefFloat4:
; DIS-LABEL: 00000000 <testDerefFloat4>:
; DIS-LABEL: {{.+}} <testDerefFloat4>:
; IASM-LABEL: testDerefFloat4:
entry:
%ret = load <4 x float>, <4 x float>* %p, align 4
; ASM: vld1.64 q0, [r0]
; DIS: 0: f4200acf
; ASM: vld1.32 q11, [r5]
; DIS: {{.*}}: f4656a8f
; IASM-NOT: vld1.32
ret <4 x float> %ret
}
define internal <4 x i32> @testDeref4i32(<4 x i32> *%p) {
; ASM-LABEL: testDeref4i32:
; DIS-LABEL: 00000010 <testDeref4i32>:
; DIS-LABEL: {{.+}} <testDeref4i32>:
; IASM-LABEL: testDeref4i32:
entry:
%ret = load <4 x i32>, <4 x i32>* %p, align 4
; ASM: vld1.64 q0, [r0]
; DIS: 10: f4200acf
; ASM: vld1.32 q11, [r5]
; DIS: {{.+}}: f4656a8f
; IASM-NOT: vld1.32
ret <4 x i32> %ret
}
define internal <8 x i16> @testDeref8i16(<8 x i16> *%p) {
; ASM-LABEL: testDeref8i16:
; DIS-LABEL: 00000020 <testDeref8i16>:
; DIS-LABEL: {{.+}} <testDeref8i16>:
; IASM-LABEL: testDeref8i16:
entry:
%ret = load <8 x i16>, <8 x i16>* %p, align 2
; ASM: vld1.64 q0, [r0]
; DIS: 20: f4200acf
; ASM: vld1.16 q11, [r5]
; DIS: {{.+}}: f4656a4f
; IASM-NOT: vld1.16
ret <8 x i16> %ret
}
define internal <16 x i8> @testDeref16i8(<16 x i8> *%p) {
; ASM-LABEL: testDeref16i8:
; DIS-LABEL: 00000030 <testDeref16i8>:
; DIS-LABEL: {{.+}} <testDeref16i8>:
; IASM-LABEL: testDeref16i8:
entry:
%ret = load <16 x i8>, <16 x i8>* %p, align 1
; ASM: vld1.64 q0, [r0]
; DIS: 30: f4200acf
; ASM: vld1.8 q11, [r5]
; DIS: {{.+}}: f4656a0f
; IASM-NOT: vld1.8
ret <16 x i8> %ret
}
; Show that we know how to translate vector store instructions.
; Note: Uses -O2 to remove unnecessary loads/stores, resulting in only one VST1
; instruction per function.
; REQUIRES: allow_dump
; Compile using standalone assembler.
; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
; RUN: -reg-use=q11,r5 \
; RUN: | FileCheck %s --check-prefix=ASM
; Show bytes in assembled standalone code.
; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
; RUN: --args -O2 \
; RUN: -reg-use=q11,r5 \
; RUN: | FileCheck %s --check-prefix=DIS
; Compile using integrated assembler.
; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -O2 \
; RUN: -reg-use=q11,r5 \
; RUN: | FileCheck %s --check-prefix=IASM
; Show bytes in assembled integrated code.
; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
; RUN: --args -O2 \
; RUN: -reg-use=q11,r5 \
; RUN: | FileCheck %s --check-prefix=DIS
define internal void @testDerefFloat4(<4 x float>* %p, <4 x float> %v) {
; ASM-LABEL: testDerefFloat4:
; DIS-LABEL: {{.+}} <testDerefFloat4>:
entry:
store <4 x float> %v, <4 x float>* %p, align 4
; ASM: vst1.32 q11, [r5]
; DIS: {{.+}}: f4456a8f
; IASM-NOT: vst1.32
ret void
}
define internal void @testDeref4i32(<4 x i32> *%p, <4 x i32> %v) {
; ASM-LABEL: testDeref4i32:
; DIS-LABEL: {{.+}} <testDeref4i32>:
entry:
store <4 x i32> %v, <4 x i32>* %p, align 4
; ASM: vst1.32 q11, [r5]
; DIS: {{.+}}: f4456a8f
; IASM-NOT: vst1.32
ret void
}
define internal void @testDeref8i16(<8 x i16> *%p, <8 x i16> %v) {
; ASM-LABEL: testDeref8i16:
; DIS-LABEL: {{.+}} <testDeref8i16>:
store <8 x i16> %v, <8 x i16>* %p, align 2
; ASM: vst1.16 q11, [r5]
; DIS: {{.+}}: f4456a4f
; IASM-NOT: vst1.16
ret void
}
define internal void @testDeref16i8(<16 x i8> *%p, <16 x i8> %v) {
; ASM-LABEL: testDeref16i8:
; DIS-LABEL: {{.+}} <testDeref16i8>:
store <16 x i8> %v, <16 x i8>* %p, align 1
; ASM: vst1.8 q11, [r5]
; DIS: {{.+}}: f4456a0f
; IASM-NOT: vst1.8
ret void
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment