Add the VMLS instruction to the integrated ARM assembler.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4334
R=jpp@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/1642303002 .

Add the VMLS instruction to the integrated ARM assembler.
694cdbd8 · Karl Schimpf · b40595a1 · 694cdbd8 · 694cdbd8 · 694cdbd8
Commit 694cdbd8 authored Jan 29, 2016 by Karl Schimpf
6 changed files
--- a/src/DartARM32/assembler_arm.cc
+++ b/src/DartARM32/assembler_arm.cc
@@ -1012,21 +1012,19 @@ void Assembler::vmlad(DRegister dd, DRegister dn, DRegister dm,
                      Condition cond) {
  EmitVFPddd(cond, 0, dd, dn, dm);
 }
-#endif
+// Moved to Arm32::AssemblerARM32::vmlss() (single-precision VMLS).
 void Assembler::vmlss(SRegister sd, SRegister sn, SRegister sm,
                       Condition cond) {
  EmitVFPsss(cond, B6, sd, sn, sm);
 }
+// Moved to Arm32::AssemblerARM32::vmlsd() (double-precision VMLS).
 void Assembler::vmlsd(DRegister dd, DRegister dn, DRegister dm,
                       Condition cond) {
  EmitVFPddd(cond, B6, dd, dn, dm);
 }
-#if 0
 // Moved to Arm32::AssemblerARM32::vdivs()
 void Assembler::vdivs(SRegister sd, SRegister sn, SRegister sm,
                      Condition cond) {

--- a/src/DartARM32/assembler_arm.h
+++ b/src/DartARM32/assembler_arm.h
@@ -696,10 +696,10 @@ class Assembler : public ValueObject {
  void vmlas(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL);
  // Moved to Arm32::AssemblerARM32::vmlad()
  void vmlad(DRegister dd, DRegister dn, DRegister dm, Condition cond = AL);
-#endif
+  // Moved to Arm32::AssemblerARM32::vmlss()
  void vmlss(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL);
+  // Moved to Arm32::AssemblerARM32::vmlsd()
  void vmlsd(DRegister dd, DRegister dn, DRegister dm, Condition cond = AL);
-#if 0
  // Moved to Arm32::AssemblerARM32::vdivs()
  void vdivs(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL);
  // Moved to Arm32::AssemblerARM32::vdivd()

--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -2520,6 +2520,30 @@ void AssemblerARM32::vmlas(const Operand *OpSd, const Operand *OpSn,
  emitVFPsss(Cond, VmlasOpcode, OpSd, OpSn, OpSm, Vmlas);
 }
+void AssemblerARM32::vmlsd(const Operand *OpDd, const Operand *OpDn,
+                           const Operand *OpDm, CondARM32::Cond Cond) {
+  // VMLA, VMLS (floating-point), ARM section A8.8.337, encoding A2:
+  //   vmls<c>.f64 <Dd>, <Dn>, <Dm>
+  //
+  // cccc11100D00nnnndddd1011N1M0mmmm where cccc=Cond, Ddddd=Dd, Nnnnn=Dn, and
+  // Mmmmm=Dm
+  //
+  // Bit 6 (the '1' between N and M above) selects the subtract form. The
+  // name string must identify this instruction (vmlsd), not vmlad, so that
+  // anything emitVFPddd reports under that name points at the right emitter.
+  constexpr const char *Vmlsd = "vmlsd";
+  constexpr IValueT VmlsdOpcode = B6;
+  emitVFPddd(Cond, VmlsdOpcode, OpDd, OpDn, OpDm, Vmlsd);
+}
+void AssemblerARM32::vmlss(const Operand *OpSd, const Operand *OpSn,
+                           const Operand *OpSm, CondARM32::Cond Cond) {
+  // VMLA, VMLS (floating-point), ARM section A8.8.337, encoding A2:
+  //   vmls<c>.f32 <Sd>, <Sn>, <Sm>
+  //
+  // cccc11100D00nnnndddd1010N1M0mmmm where cccc=Cond, ddddD=Sd, nnnnN=Sn, and
+  // mmmmM=Sm
+  //
+  // Bit 6 (the '1' between N and M above) selects the subtract form. The
+  // name string must identify this instruction (vmlss), not vmlas, so that
+  // anything emitVFPsss reports under that name points at the right emitter.
+  constexpr const char *Vmlss = "vmlss";
+  constexpr IValueT VmlssOpcode = B6;
+  emitVFPsss(Cond, VmlssOpcode, OpSd, OpSn, OpSm, Vmlss);
+}
 void AssemblerARM32::vmrsAPSR_nzcv(CondARM32::Cond Cond) {
  // MVRS - ARM section A*.8.348, encoding A1:
  //   vmrs<c> APSR_nzcv, FPSCR

--- a/src/IceAssemblerARM32.h
+++ b/src/IceAssemblerARM32.h
@@ -402,6 +402,12 @@ public:
  void vmlas(const Operand *OpSd, const Operand *OpSn, const Operand *OpSm,
             CondARM32::Cond Cond);
+  void vmlsd(const Operand *OpDd, const Operand *OpDn, const Operand *OpDm,
+             CondARM32::Cond Cond);
+  void vmlss(const Operand *OpSd, const Operand *OpSn, const Operand *OpSm,
+             CondARM32::Cond Cond);
  // Uses APSR_nzcv as register
  void vmrsAPSR_nzcv(CondARM32::Cond Cond);

--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -666,15 +666,36 @@ template <> void InstARM32Vmla::emitIAS(const Cfg *Func) const {
  default:
    // TODO(kschimpf) Figure out how vector operations apply.
    emitUsingTextFixup(Func);
-    break;
+    return;
  case IceType_f32:
    Asm->vmlas(getDest(), getSrc(1), getSrc(2), CondARM32::AL);
    assert(!Asm->needsTextFixup());
-    break;
+    return;
  case IceType_f64:
    Asm->vmlad(getDest(), getSrc(1), getSrc(2), CondARM32::AL);
    assert(!Asm->needsTextFixup());
-    break;
+    return;
+  }
+}
+template <> void InstARM32Vmls::emitIAS(const Cfg *Func) const {
+  // Four-address FP instructions reuse the destination as source 0, so
+  // exactly three source operands are expected here.
+  assert(getSrcSize() == 3);
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  const auto DestTy = Dest->getType();
+  switch (DestTy) {
+  case IceType_f32:
+    // Scalar single precision: encode directly, no text fixup expected.
+    Asm->vmlss(Dest, getSrc(1), getSrc(2), CondARM32::AL);
+    assert(!Asm->needsTextFixup());
+    return;
+  case IceType_f64:
+    // Scalar double precision: encode directly, no text fixup expected.
+    Asm->vmlsd(Dest, getSrc(1), getSrc(2), CondARM32::AL);
+    assert(!Asm->needsTextFixup());
+    return;
+  default:
+    // TODO(kschimpf) Figure out how vector operations apply.
+    emitUsingTextFixup(Func);
+    return;
  }
 }

--- a/tests_lit/assembler/arm32/vmls.ll
+++ b/tests_lit/assembler/arm32/vmls.ll
+; Show that we can take advantage of the vmls instruction for floating point
+; operations during optimization.
+; Note that we use -O2 to force the result of the fmul to be
+; (immediately) available for the fsub. When using -Om1, the merge of
+; fmul and fsub does not happen.
+; REQUIRES: allow_dump
+; Compile using standalone assembler.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
+; RUN:   -reg-use=s20,s21,s22,d20,d21,d22 \
+; RUN:   | FileCheck %s --check-prefix=ASM
+; Show bytes in assembled standalone code.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 -reg-use=s20,s21,s22,d20,d21,d22 \
+; RUN:   | FileCheck %s --check-prefix=DIS
+; Compile using integrated assembler.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -O2 \
+; RUN:   -reg-use=s20,s21,s22,d20,d21,d22 \
+; RUN:   | FileCheck %s --check-prefix=IASM
+; Show bytes in assembled integrated code.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 -reg-use=s20,s21,s22,d20,d21,d22 \
+; RUN:   | FileCheck %s --check-prefix=DIS
+; Computes %f2 - (%f1 * 1.5) in single precision. The checks below expect the
+; fmul/fsub pair to be emitted as one vmls.f32 on the registers named by
+; -reg-use; ee4aaa4b is the encoding of vmls.f32 s21, s20, s22. The final
+; negative check requires that the integrated assembler's text output contain
+; no vmls.f32 (presumably because the bytes are emitted directly rather than
+; through a text fixup -- confirm).
+define internal float @mulSubFloat(float %f1, float %f2) {
+; ASM-LABEL: mulSubFloat:
+; DIS-LABEL: 00000000 <mulSubFloat>:
+  %v1 = fmul float %f1, 1.5
+  %v2 = fsub float %f2, %v1
+; ASM:  vmls.f32        s21, s20, s22
+; DIS:   10:    ee4aaa4b
+; IASM-NOT: vmls.f32
+  ret float %v2
+}
+; Computes %f2 - (%f1 * 1.5) in double precision. The checks below expect the
+; fmul/fsub pair to be emitted as one vmls.f64 on the registers named by
+; -reg-use; ee445be6 is the encoding of vmls.f64 d21, d20, d22. The final
+; negative check requires that the integrated assembler's text output contain
+; no vmls.f64 (presumably because the bytes are emitted directly rather than
+; through a text fixup -- confirm).
+define internal double @mulSubDouble(double %f1, double %f2) {
+; ASM-LABEL: mulSubDouble:
+; DIS-LABEL: 00000020 <mulSubDouble>:
+  %v1 = fmul double %f1, 1.5
+  %v2 = fsub double %f2, %v1
+; ASM:  vmls.f64        d21, d20, d22
+; DIS:   2c:    ee445be6
+; IASM-NOT: vmls.f64
+  ret double %v2
+}