Generate better two address code by using commutativity

For operations such as

    t0 = t1 + t2

Subzero's pattern for arithmetic operations generates two-address code that looks like

    movl ...t1..., %ecx
    addl ...t2..., %ecx    // t0 is in %ecx

When register pressure is high this sometimes becomes:

    movl ...t2..., SPILL
    movl ...t1..., %ecx
    addl SPILL, %ecx       // t0 is in %ecx

This CL takes advantage of cases where the use of t2 is the last one, so the register that held t2 before the operation can be reused. The optimization simply swaps the operands of the (commutative) operation to

    t0 = t2 + t1

which then generates code as

    movl ...t2..., %ecx
    addl ...t1..., %ecx    // t0 is in %ecx

This optimization is used for any commutative operation, which now includes Fadd and Fmul, which were erroneously marked as non-commutative. See the rationale in IceInst.def for the IEEE wordings.

BUG=
R=jfb@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1371703003 .

Generate better two address code by using commutativity
487bad02 · David Sehr · e11f878a · 487bad02 · 487bad02 · 487bad02
Commit 487bad02 authored Oct 06, 2015 by David Sehr
4 changed files
--- a/src/IceInst.def
+++ b/src/IceInst.def
@@ -14,14 +14,31 @@
 #ifndef SUBZERO_SRC_ICEINST_DEF
 #define SUBZERO_SRC_ICEINST_DEF

+// Floating point addition and multiplication are commutative.
+// 1) non-special values and infinities are required to commute.
+// 2) signed zeroes are handled by:
+//    From IEEE standard 754-2008:
+//      When the sum of two operands with opposite signs (or the difference of
+//      two operands with like signs) is exactly zero, the sign of that sum
+//      (or difference) shall be +0 in all rounding-direction attributes
+//      except roundTowardNegative; under that attribute, the sign of an exact
+//      zero sum (or difference) shall be −0.
+// 3) NaNs are handled by:
+//    http://grouper.ieee.org/groups/1788/email/msg03558.html
+//      clause of 754 at work is 6.2.3 NaN propagation:
+//      "If two or more inputs are NaN, then the payload of the resulting NaN
+//      should be identical to the payload of one of the input NaNs if
+//      representable in the destination format. This standard does not
+//      specify which of the input NaNs will provide the payload."
+
 #define ICEINSTARITHMETIC_TABLE                   \
  /* enum value, printable string, commutative */ \
  X(Add,         "add",            1)             \
-  X(Fadd,        "fadd",           0)             \
+  X(Fadd,        "fadd",           1)             \
  X(Sub,         "sub",            0)             \
  X(Fsub,        "fsub",           0)             \
  X(Mul,         "mul",            1)             \
-  X(Fmul,        "fmul",           0)             \
+  X(Fmul,        "fmul",           1)             \
  X(Udiv,        "udiv",           0)             \
  X(Sdiv,        "sdiv",           0)             \
  X(Fdiv,        "fdiv",           0)             \

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1293,10 +1293,23 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
  Operand *Src0 = legalize(Inst->getSrc(0));
  Operand *Src1 = legalize(Inst->getSrc(1));
  if (Inst->isCommutative()) {
-    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
+    uint32_t SwapCount = 0;
+    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
      std::swap(Src0, Src1);
-    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
+      ++SwapCount;
+    }
+    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
+      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    // Improve two-address code patterns by avoiding a copy to the dest
+    // register when one of the source operands ends its lifetime here.
+    if (!Inst->isLastUse(Src0) && Inst->isLastUse(Src1)) {
      std::swap(Src0, Src1);
+      ++SwapCount;
+    }
+    assert(SwapCount <= 1);
+    (void) SwapCount;
  }
  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
    // These x86-32 helper-call-involved instructions are lowered in this

--- a/tests_lit/assembler/x86/opcode_register_encodings.ll
+++ b/tests_lit/assembler/x86/opcode_register_encodings.ll
@@ -16,14 +16,14 @@ entry:
 ; Test register and address mode encoding.
 define <8 x i16> @test_mul_v8i16_more_regs(<8 x i1> %cond, <8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2, <8 x i16> %arg3, <8 x i16> %arg4, <8 x i16> %arg5, <8 x i16> %arg6, <8 x i16> %arg7, <8 x i16> %arg8) {
 entry:
-  %res1 = mul <8 x i16> %arg0, %arg1
-  %res2 = mul <8 x i16> %arg0, %arg2
-  %res3 = mul <8 x i16> %arg0, %arg3
-  %res4 = mul <8 x i16> %arg0, %arg4
-  %res5 = mul <8 x i16> %arg0, %arg5
-  %res6 = mul <8 x i16> %arg0, %arg6
-  %res7 = mul <8 x i16> %arg0, %arg7
-  %res8 = mul <8 x i16> %arg0, %arg8
+  %res1 = sub <8 x i16> %arg0, %arg1
+  %res2 = sub <8 x i16> %arg0, %arg2
+  %res3 = sub <8 x i16> %arg0, %arg3
+  %res4 = sub <8 x i16> %arg0, %arg4
+  %res5 = sub <8 x i16> %arg0, %arg5
+  %res6 = sub <8 x i16> %arg0, %arg6
+  %res7 = sub <8 x i16> %arg0, %arg7
+  %res8 = sub <8 x i16> %arg0, %arg8
  %res_acc1 = select <8 x i1> %cond, <8 x i16> %res1, <8 x i16> %res2
  %res_acc2 = select <8 x i1> %cond, <8 x i16> %res3, <8 x i16> %res4
  %res_acc3 = select <8 x i1> %cond, <8 x i16> %res5, <8 x i16> %res6
@@ -33,14 +33,14 @@ entry:
  %res = select <8 x i1> %cond, <8 x i16> %res_acc1_3, <8 x i16> %res_acc2_4
  ret <8 x i16> %res
 ; CHECK-LABEL: test_mul_v8i16_more_regs
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmullw xmm0,XMMWORD PTR [esp
-; CHECK-DAG: pmullw xmm1,XMMWORD PTR [esp
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubw xmm0,XMMWORD PTR [esp
+; CHECK-DAG: psubw xmm1,XMMWORD PTR [esp
 }

 define <4 x i32> @test_mul_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -53,14 +53,14 @@ entry:

 define <4 x i32> @test_mul_v4i32_more_regs(<4 x i1> %cond, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, <4 x i32> %arg4, <4 x i32> %arg5, <4 x i32> %arg6, <4 x i32> %arg7, <4 x i32> %arg8) {
 entry:
-  %res1 = mul <4 x i32> %arg0, %arg1
-  %res2 = mul <4 x i32> %arg0, %arg2
-  %res3 = mul <4 x i32> %arg0, %arg3
-  %res4 = mul <4 x i32> %arg0, %arg4
-  %res5 = mul <4 x i32> %arg0, %arg5
-  %res6 = mul <4 x i32> %arg0, %arg6
-  %res7 = mul <4 x i32> %arg0, %arg7
-  %res8 = mul <4 x i32> %arg0, %arg8
+  %res1 = sub <4 x i32> %arg0, %arg1
+  %res2 = sub <4 x i32> %arg0, %arg2
+  %res3 = sub <4 x i32> %arg0, %arg3
+  %res4 = sub <4 x i32> %arg0, %arg4
+  %res5 = sub <4 x i32> %arg0, %arg5
+  %res6 = sub <4 x i32> %arg0, %arg6
+  %res7 = sub <4 x i32> %arg0, %arg7
+  %res8 = sub <4 x i32> %arg0, %arg8
  %res_acc1 = select <4 x i1> %cond, <4 x i32> %res1, <4 x i32> %res2
  %res_acc2 = select <4 x i1> %cond, <4 x i32> %res3, <4 x i32> %res4
  %res_acc3 = select <4 x i1> %cond, <4 x i32> %res5, <4 x i32> %res6
@@ -70,14 +70,14 @@ entry:
  %res = select <4 x i1> %cond, <4 x i32> %res_acc1_3, <4 x i32> %res_acc2_4
  ret <4 x i32> %res
 ; CHECK-LABEL: test_mul_v4i32_more_regs
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
-; CHECK-DAG: pmulld xmm0,XMMWORD PTR [esp
-; CHECK-DAG: pmulld xmm1,XMMWORD PTR [esp
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,{{xmm[0-7]|xmmword ptr\[esp}}
+; CHECK-DAG: psubd xmm0,XMMWORD PTR [esp
+; CHECK-DAG: psubd xmm1,XMMWORD PTR [esp
 }

 ; Test movq, which is used by atomic stores.

--- a/tests_lit/llvm2ice_tests/commutativity.ll
+++ b/tests_lit/llvm2ice_tests/commutativity.ll
+; Test the lowering sequence for commutative operations.  If there is a source
+; operand whose lifetime ends in an operation, it should be the first operand,
+; eliminating the need for a move to start the new lifetime.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+define i32 @integerAddLeft(i32 %a, i32 %b) {
+entry:
+  %tmp = add i32 %a, %b
+  %result = add i32 %a, %tmp
+  ret i32 %result
+}
+; CHECK-LABEL: integerAddLeft
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: add {{e..}},{{e..}}
+; CHECK-NEXT: add {{e..}},{{e..}}
+
+define i32 @integerAddRight(i32 %a, i32 %b) {
+entry:
+  %tmp = add i32 %a, %b
+  %result = add i32 %b, %tmp
+  ret i32 %result
+}
+; CHECK-LABEL: integerAddRight
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: add {{e..}},{{e..}}
+; CHECK-NEXT: add {{e..}},{{e..}}
+
+define i32 @integerMultiplyLeft(i32 %a, i32 %b) {
+entry:
+  %tmp = mul i32 %a, %b
+  %result = mul i32 %a, %tmp
+  ret i32 %result
+}
+; CHECK-LABEL: integerMultiplyLeft
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: imul {{e..}},{{e..}}
+; CHECK-NEXT: imul {{e..}},{{e..}}
+
+define i32 @integerMultiplyRight(i32 %a, i32 %b) {
+entry:
+  %tmp = mul i32 %a, %b
+  %result = mul i32 %b, %tmp
+  ret i32 %result
+}
+; CHECK-LABEL: integerMultiplyRight
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: mov {{e..}},DWORD PTR
+; CHECK-NEXT: imul {{e..}},{{e..}}
+; CHECK-NEXT: imul {{e..}},{{e..}}
+
+define float @floatAddLeft(float %a, float %b) {
+entry:
+  %tmp = fadd float %a, %b
+  %result = fadd float %a, %tmp
+  ret float %result
+}
+; CHECK-LABEL: floatAddLeft
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: addss xmm1,xmm0
+; CHECK-NEXT: addss xmm0,xmm1
+
+define float @floatAddRight(float %a, float %b) {
+entry:
+  %tmp = fadd float %a, %b
+  %result = fadd float %b, %tmp
+  ret float %result
+}
+; CHECK-LABEL: floatAddRight
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: addss xmm0,xmm1
+; CHECK-NEXT: addss xmm1,xmm0
+
+define float @floatMultiplyLeft(float %a, float %b) {
+entry:
+  %tmp = fmul float %a, %b
+  %result = fmul float %a, %tmp
+  ret float %result
+}
+; CHECK-LABEL: floatMultiplyLeft
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: mulss xmm1,xmm0
+; CHECK-NEXT: mulss xmm0,xmm1
+
+define float @floatMultiplyRight(float %a, float %b) {
+entry:
+  %tmp = fmul float %a, %b
+  %result = fmul float %b, %tmp
+  ret float %result
+}
+; CHECK-LABEL: floatMultiplyRight
+; CHECK-NEXT: movss xmm0,DWORD PTR
+; CHECK-NEXT: movss xmm1,DWORD PTR
+; CHECK-NEXT: mulss xmm0,xmm1
+; CHECK-NEXT: mulss xmm1,xmm0