Subzero: Strength-reduce mul by certain constants.

These all appear to some degree in spec2k. This is implemented for i8/i16/i32 types. It is done as part of core lowering, so in theory all optimization levels could benefit, but it is explicitly disabled for Om1/O0 to keep things simple there. While clang appears to strength-reduce udiv/urem by a constant power of 2, for some reason it does not always strength-reduce multiplies (given that they appear in the spec2k bitcode). For multiplies by 3, 5, or 9, we can make use of the lea instruction. We can do combinations of shift and lea to multiply by other constants, e.g. 100=5*5*4. If too many operations would be required, just give up and use the mul instruction. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095 R=jpp@chromium.org, jvoung@chromium.org Review URL: https://codereview.chromium.org/1146803002

Subzero: Strength-reduce mul by certain constants.
0933c0cf · Jim Stichnoth · 326534a3 · 0933c0cf · 0933c0cf · 0933c0cf
Commit 0933c0cf authored Jun 12, 2015 by Jim Stichnoth
12 changed files
--- a/crosstest/crosstest.cfg
+++ b/crosstest/crosstest.cfg
@@ -42,6 +42,13 @@ test: test_select.ll
 driver: test_stacksave_main.c
 test: test_stacksave.c
+[test_strengthreduce]
+driver: test_strengthreduce_main.cpp
+test: test_strengthreduce.cpp
+# Disable clang-side optimizations so that pnacl-sz sees suitable
+# bitcode patterns.
+flags: --clang-opt=0
 [test_sync_atomic]
 driver: test_sync_atomic_main.cpp
 # Compile the non-Subzero object files straight from source since the native

--- a/crosstest/test_strengthreduce.cpp
+++ b/crosstest/test_strengthreduce.cpp
+//===- subzero/crosstest/test_strengthreduce.cpp - Strength reduction -----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation for crosstesting strength reduction.
+//
+//===----------------------------------------------------------------------===//
+#include "test_strengthreduce.h"
+// TODO(stichnot): Extend to i16 and i8 types, and also test the
+// commutativity transformations.  This may require hand-generating
+// .ll files, because of C/C++ integer promotion rules for arithmetic,
+// and because clang prefers to do its own commutativity
+// transformation.
+// X-macro: for every (constant, suffix) row of CONST_TABLE, define an unsigned
+// and a signed overload of multiplyByConst<suffix>() that returns its argument
+// multiplied by that constant (cast to the argument's type).
+// NOTE(review): crosstest.cfg builds this test with --clang-opt=0 so that
+// pnacl-sz sees suitable bitcode patterns; presumably each body should stay a
+// direct multiply by a literal for that reason -- confirm before restructuring.
+#define X(constant, suffix)                                                    \
+  uint32_t multiplyByConst##suffix(uint32_t Val) {                             \
+    return Val * (uint32_t)constant;                                           \
+  }                                                                            \
+  int32_t multiplyByConst##suffix(int32_t Val) {                               \
+    return Val * (int32_t)constant;                                            \
+  }
+CONST_TABLE
+#undef X
--- a/crosstest/test_strengthreduce.def
+++ b/crosstest/test_strengthreduce.def
+//===- subzero/crosstest/test_strengthreduce.def - macros -----*- C++ -*---===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for crosstesting strength reduction.
+//
+//===----------------------------------------------------------------------===//
+#ifndef TEST_STRENGTHREDUCE_DEF
+#define TEST_STRENGTHREDUCE_DEF
+// Stringification helpers: STR(s) turns its argument into a string literal as
+// written; XSTR(s) macro-expands its argument first and then stringifies it.
+#define XSTR(s) STR(s)
+#define STR(s) #s
+// Table of multiplier constants to test.  Each row is X(constant, suffix):
+// "constant" is the multiplier and "suffix" is the token that users of the
+// table paste onto an identifier (negative constants use a leading underscore
+// in the suffix).  The includer must define X before expanding CONST_TABLE.
+// The last row's trailing backslash continues the macro onto the comment line
+// that follows the table, so that line must stay directly after the last row.
+#define CONST_TABLE \
+  X(   -10,    _10) \
+  X(    -7,     _7) \
+  X(    -2,     _2) \
+  X(    -1,     _1) \
+  X(     0,      0) \
+  X(     1,      1) \
+  X(     2,      2) \
+  X(     3,      3) \
+  X(     4,      4) \
+  X(     5,      5) \
+  X(     7,      7) \
+  X(     9,      9) \
+  X(    10,     10) \
+  X(   100,    100) \
+  X(100000, 100000) \
+//#define X(constant, suffix)
+#endif // !TEST_STRENGTHREDUCE_DEF
--- a/crosstest/test_strengthreduce.h
+++ b/crosstest/test_strengthreduce.h
+//===- subzero/crosstest/test_strengthreduce.h - Prototypes ---*- C++ -*---===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes used for crosstesting strength
+// reduction.
+//
+//===----------------------------------------------------------------------===//
+#include <stdint.h>
+#include "test_strengthreduce.def"
+// Declare, for every CONST_TABLE row, the unsigned and signed overloads of
+// multiplyByConst<suffix>().  This header intentionally has no include guard:
+// the test driver includes it a second time inside namespace Subzero_.
+#define X(constant, suffix)                                                    \
+  uint32_t multiplyByConst##suffix(uint32_t val);                              \
+  int32_t multiplyByConst##suffix(int32_t val);
+CONST_TABLE
+#undef X
--- a/crosstest/test_strengthreduce_main.cpp
+++ b/crosstest/test_strengthreduce_main.cpp
+//===- subzero/crosstest/test_strengthreduce_main.cpp - Driver for tests --===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for crosstesting arithmetic strength-reducing optimizations.
+//
+//===----------------------------------------------------------------------===//
+/* crosstest.py --test=test_strengthreduce.cpp \
+   --driver=test_strengthreduce_main.cpp \
+   --prefix=Subzero_ --clang-opt=0 --output=test_strengthreduce */
+#include <iostream>
+// Include test_strengthreduce.h twice - once normally, and once
+// within the Subzero_ namespace, corresponding to the llc and Subzero
+// translated object files, respectively.
+#include "test_strengthreduce.h"
+namespace Subzero_ {
+#include "test_strengthreduce.h"
+}
+// Runs every multiplyByConst<suffix>() overload (signed and unsigned) on each
+// entry of Values and compares the result of the plain function against the
+// one in namespace Subzero_.  Prints one line per mismatch plus a final
+// summary, and returns the number of failures.
+int main(int argc, char **argv) {
+  size_t TotalTests = 0;
+  size_t Passes = 0;
+  size_t Failures = 0;
+  static int32_t Values[] = {-100, -50, 0, 1, 8, 123, 0x33333333, 0x77777777};
+  for (size_t i = 0; i < sizeof(Values) / sizeof(*Values); ++i) {
+    int32_t SVal = Values[i];
+    int32_t ResultLlcS, ResultSzS;
+    uint32_t UVal = (uint32_t)Values[i];
+    uint32_t ResultLlcU, ResultSzU;
+// The signed half must pass SVal so that overload resolution selects the
+// int32_t functions; passing UVal would silently test the uint32_t overload
+// twice.  Each comparison counts as one test in TotalTests.
+#define X(constant, suffix)                                                    \
+  ++TotalTests;                                                                \
+  ResultLlcS = multiplyByConst##suffix(SVal);                                  \
+  ResultSzS = Subzero_::multiplyByConst##suffix(SVal);                         \
+  if (ResultLlcS == ResultSzS) {                                               \
+    ++Passes;                                                                  \
+  } else {                                                                     \
+    ++Failures;                                                                \
+    std::cout << "multiplyByConstS" STR(suffix) "(" << SVal                    \
+              << "): sz=" << ResultSzS << " llc=" << ResultLlcS << "\n";       \
+  }                                                                            \
+  ++TotalTests;                                                                \
+  ResultLlcU = multiplyByConst##suffix(UVal);                                  \
+  ResultSzU = Subzero_::multiplyByConst##suffix(UVal);                         \
+  if (ResultLlcU == ResultSzU) {                                               \
+    ++Passes;                                                                  \
+  } else {                                                                     \
+    ++Failures;                                                                \
+    std::cout << "multiplyByConstU" STR(suffix) "(" << UVal                    \
+              << "): sz=" << ResultSzU << " llc=" << ResultLlcU << "\n";       \
+  }
+    CONST_TABLE
+#undef X
+  }
+  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+            << " Failures=" << Failures << "\n";
+  return Failures;
+}
--- a/pydir/crosstest.py
+++ b/pydir/crosstest.py
@@ -40,9 +40,11 @@ def main():
    argparser.add_argument('-O', required=False, default='2', dest='optlevel',
                           choices=['m1', '-1', '0', '1', '2'],
                           metavar='OPTLEVEL',
-                           help='Optimization level ' +
+                           help='Optimization level for llc and Subzero ' +
                                '(m1 and -1 are equivalent).' +
                                ' Default %(default)s.')
+    argparser.add_argument('--clang-opt', required=False, default=True,
+                           dest='clang_opt')
    argparser.add_argument('--mattr',  required=False, default='sse2',
                           dest='attr', choices=['sse2', 'sse4.1'],
                           metavar='ATTRIBUTE',
@@ -92,7 +94,8 @@ def main():
            bitcode_nonfinal = os.path.join(args.dir, base + '.' + key + '.bc')
            bitcode = os.path.join(args.dir, base + '.' + key + '.pnacl.ll')
            shellcmd(['{bin}/pnacl-clang'.format(bin=bindir),
-                      '-O2', '-c', arg, '-o', bitcode_nonfinal])
+                      ('-O2' if args.clang_opt else '-O0'), '-c', arg,
+                      '-o', bitcode_nonfinal])
            shellcmd(['{bin}/pnacl-opt'.format(bin=bindir),
                      '-pnacl-abi-simplify-preopt',
                      '-pnacl-abi-simplify-postopt',

--- a/src/IceELFObjectWriter.cpp
+++ b/src/IceELFObjectWriter.cpp
@@ -383,8 +383,9 @@ void ELFObjectWriter::writeDataOfType(SectionType ST,
      for (VariableDeclaration::Initializer *Init : Var->getInitializers()) {
        switch (Init->getKind()) {
        case VariableDeclaration::Initializer::DataInitializerKind: {
-          const auto Data = llvm::cast<VariableDeclaration::DataInitializer>(
+          const auto Data =
-                                Init)->getContents();
+              llvm::cast<VariableDeclaration::DataInitializer>(Init)
+                  ->getContents();
          Section->appendData(Str, llvm::StringRef(Data.data(), Data.size()));
          break;
        }

--- a/src/IceGlobalInits.h
+++ b/src/IceGlobalInits.h
@@ -291,9 +291,7 @@ public:
    return isExternal() && !hasInitializer();
  }
-  void setSuppressMangling() {
+  void setSuppressMangling() { ForceSuppressMangling = true; }
-    ForceSuppressMangling = true;
-  }
 private:
  // list of initializers for the declared variable.

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -560,6 +560,8 @@ protected:
    Context.getLastInserted()->setDestNonKillable();
  }
+  bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
  const X86InstructionSet InstructionSet;
  bool IsEbpBasedFrame;
  bool NeedsStackAlignment;

--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -197,25 +197,25 @@ entry:
 define internal i32 @testMul16Imm16(i32 %arg) {
 entry:
  %arg_i16 = trunc i32 %arg to i16
-  %tmp = mul i16 %arg_i16, 1024
+  %tmp = mul i16 %arg_i16, 1025
  %result_i16 = add i16 %tmp, 1
  %result = zext i16 %result_i16 to i32
  ret i32 %result
 }
 ; CHECK-LABEL: testMul16Imm16
-; CHECK: 66 69 c0 00 04  imul ax,ax
+; CHECK: 66 69 c0 01 04  imul ax,ax
 ; CHECK-NEXT: add ax,0x1
 define internal i32 @testMul16Imm16Neg(i32 %arg) {
 entry:
  %arg_i16 = trunc i32 %arg to i16
-  %tmp = mul i16 %arg_i16, -256
+  %tmp = mul i16 %arg_i16, -255
  %result_i16 = add i16 %tmp, 1
  %result = zext i16 %result_i16 to i32
  ret i32 %result
 }
 ; CHECK-LABEL: testMul16Imm16Neg
-; CHECK: 66 69 c0 00 ff  imul ax,ax
+; CHECK: 66 69 c0 01 ff  imul ax,ax,0xff01
 ; CHECK-NEXT: add ax,0x1
 define internal i32 @testMul32Imm8(i32 %arg) {
@@ -236,19 +236,19 @@ entry:
 define internal i32 @testMul32Imm16(i32 %arg) {
 entry:
-  %result = mul i32 %arg, 1024
+  %result = mul i32 %arg, 1025
  ret i32 %result
 }
 ; CHECK-LABEL: testMul32Imm16
-; CHECK: 69 c0 00 04 00 00  imul eax,eax
+; CHECK: 69 c0 01 04 00 00  imul eax,eax
 define internal i32 @testMul32Imm16Neg(i32 %arg) {
 entry:
-  %result = mul i32 %arg, -256
+  %result = mul i32 %arg, -255
  ret i32 %result
 }
 ; CHECK-LABEL: testMul32Imm16Neg
-; CHECK: 69 c0 00 ff ff ff  imul eax,eax
+; CHECK: 69 c0 01 ff ff ff  imul eax,eax,0xffffff01
 ; The GPR shift instructions either allow an 8-bit immediate or
 ; have a special encoding for "1".

--- a/tests_lit/llvm2ice_tests/strength-reduce.ll
+++ b/tests_lit/llvm2ice_tests/strength-reduce.ll
+; This tests various strength reduction operations.
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; Multiply an i32 by 5 with the constant as the second operand: expects a
+; single lea of the form [reg+reg*4].
+define internal i32 @mul_i32_arg_5(i32 %arg) {
+  %result = mul i32 %arg, 5
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_arg_5
+; CHECK: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*4]
+; Same multiply by 5, but with the constant as the first operand: the same
+; single lea of the form [reg+reg*4] is expected.
+define internal i32 @mul_i32_5_arg(i32 %arg) {
+  %result = mul i32 5, %arg
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_5_arg
+; CHECK: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*4]
+; 18 = 9 * 2: expects a lea of the form [reg+reg*8] (times 9) and a shift left
+; by 1 (times 2) on the same register, accepted in either order.
+define internal i32 @mul_i32_arg_18(i32 %arg) {
+  %result = mul i32 %arg, 18
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_arg_18
+; CHECK-DAG: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*8]
+; CHECK-DAG: shl [[REG]],1
+; 27 = 3 * 9: expects two leas on the same register, one of the form
+; [reg+reg*2] (times 3) and one of the form [reg+reg*8] (times 9).
+define internal i32 @mul_i32_arg_27(i32 %arg) {
+  %result = mul i32 %arg, 27
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_arg_27
+; CHECK-DAG: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*2]
+; CHECK-DAG: lea [[REG]],{{\[}}[[REG]]+[[REG]]*8]
+; -45 = -(9 * 5): expects two leas on the same register ([reg+reg*8] for
+; times 9 and [reg+reg*4] for times 5) followed by a neg of that register.
+define internal i32 @mul_i32_arg_m45(i32 %arg) {
+  %result = mul i32 %arg, -45
+  ret i32 %result
+}
+; CHECK-LABEL: mul_i32_arg_m45
+; CHECK-DAG: lea [[REG:e..]],{{\[}}[[REG]]+[[REG]]*8]
+; CHECK-DAG: lea [[REG]],{{\[}}[[REG]]+[[REG]]*4]
+; CHECK: neg [[REG]]
+; i16 version of the multiply by 18 (9 * 2): expects a lea whose destination
+; is a 16-bit register and whose address uses the matching 32-bit register,
+; plus a shift left by 1 of the 16-bit register.
+define internal i16 @mul_i16_arg_18(i16 %arg) {
+  %result = mul i16 %arg, 18
+  ret i16 %result
+}
+; Disassembly will look like "lea ax,[eax+eax*8]".
+; CHECK-LABEL: mul_i16_arg_18
+; CHECK-DAG: lea [[REG:..]],{{\[}}e[[REG]]+e[[REG]]*8]
+; CHECK-DAG: shl [[REG]],1
+; i8 multiply by 16, a power of two: expects a single shift left by 4.
+define internal i8 @mul_i8_arg_16(i8 %arg) {
+  %result = mul i8 %arg, 16
+  ret i8 %result
+}
+; CHECK-LABEL: mul_i8_arg_16
+; CHECK: shl {{.*}},0x4
+; i8 multiply by 18: unlike the i16/i32 cases above, this is expected to stay
+; an imul instead of being strength-reduced.
+; NOTE(review): presumably because the times-9 step needs lea, which is not
+; used for i8 operands -- confirm against optimizeScalarMul.
+define internal i8 @mul_i8_arg_18(i8 %arg) {
+  %result = mul i8 %arg, 18
+  ret i8 %result
+}
+; CHECK-LABEL: mul_i8_arg_18
+; CHECK: imul