Commit 98cc08ca by John Porto

Subzero. ARM32. Strength reduce multiplications.

parent 614140e2
......@@ -49,3 +49,37 @@ SINTOP_TABLE
v4f32 test##inst(v4f32 a, v4f32 b) { return func(a op b); }
FPOP_TABLE
#undef X
// Emits, for each constant mult_by listed in MULIMM_TABLE, overloads of
// testMultiplyBy<mult_by>(a, unused) and testMultiplyByNeg<mult_by>(a, unused)
// over bool/uint8_t/uint16_t/uint32_t/uint64_t, each returning a multiplied by
// the (positive or negated) constant. The second parameter is ignored; it
// keeps every test function on the common two-argument shape the crosstest
// function tables expect (see the Funcs table entries that pair these with
// their Subzero_:: counterparts).
#define X(mult_by) \
  bool testMultiplyBy##mult_by(bool a, bool /*unused*/) { \
    return a * (mult_by); \
  } \
  bool testMultiplyByNeg##mult_by(bool a, bool /*unused*/) { \
    return a * (-(mult_by)); \
  } \
  uint8_t testMultiplyBy##mult_by(uint8_t a, uint8_t /*unused*/) { \
    return a * (mult_by); \
  } \
  uint8_t testMultiplyByNeg##mult_by(uint8_t a, uint8_t /*unused*/) { \
    return a * (-(mult_by)); \
  } \
  uint16_t testMultiplyBy##mult_by(uint16_t a, uint16_t /*unused*/) { \
    return a * (mult_by); \
  } \
  uint16_t testMultiplyByNeg##mult_by(uint16_t a, uint16_t /*unused*/) { \
    return a * (-(mult_by)); \
  } \
  uint32_t testMultiplyBy##mult_by(uint32_t a, uint32_t /*unused*/) { \
    return a * (mult_by); \
  } \
  uint32_t testMultiplyByNeg##mult_by(uint32_t a, uint32_t /*unused*/) { \
    return a * (-(mult_by)); \
  } \
  uint64_t testMultiplyBy##mult_by(uint64_t a, uint64_t /*unused*/) { \
    return a * (mult_by); \
  } \
  uint64_t testMultiplyByNeg##mult_by(uint64_t a, uint64_t /*unused*/) { \
    return a * (-(mult_by)); \
  }
MULIMM_TABLE
#undef X
......@@ -17,35 +17,35 @@
#define XSTR(s) STR(s)
#define STR(s) #s
#define UINTOP_TABLE \
/* inst, operator, div, shift */ \
X(Add, +, 0, 0) \
X(Sub, -, 0, 0) \
X(Mul, *, 0, 0) \
X(Udiv, /, 1, 0) \
X(Urem, %, 1, 0) \
X(Shl, <<, 0, 1) \
X(Lshr, >>, 0, 1) \
X(And, &, 0, 0) \
X(Or, |, 0, 0) \
X(Xor, ^, 0, 0) \
// X-macro table of the unsigned integer binary operations exercised by the
// crosstests. Columns: Subzero instruction name, the matching C operator, a
// flag for division-like operators, and a flag for shift operators.
// NOTE(review): the div/shift flag semantics are inferred from the column
// header and which ops are flagged (presumably to avoid zero divisors and to
// mask shift amounts) -- confirm at the X() expansion sites.
#define UINTOP_TABLE \
  /* inst, operator, div, shift */ \
  X(Add, +, 0, 0) \
  X(Sub, -, 0, 0) \
  X(Mul, *, 0, 0) \
  X(Udiv, /, 1, 0) \
  X(Urem, %, 1, 0) \
  X(Shl, <<, 0, 1) \
  X(Lshr, >>, 0, 1) \
  X(And, &, 0, 0) \
  X(Or, |, 0, 0) \
  X(Xor, ^, 0, 0) \
//#define X(inst, op, isdiv, isshift)
#define SINTOP_TABLE \
/* inst, operator, div, shift */ \
X(Sdiv, /, 1, 0) \
X(Srem, %, 1, 0) \
X(Ashr, >>, 0, 1) \
// X-macro table of the signed-only integer operations (same column layout as
// UINTOP_TABLE): signed divide/remainder and arithmetic shift right.
#define SINTOP_TABLE \
  /* inst, operator, div, shift */ \
  X(Sdiv, /, 1, 0) \
  X(Srem, %, 1, 0) \
  X(Ashr, >>, 0, 1) \
//#define X(inst, op, isdiv, isshift)
#define COMMA ,
#define FPOP_TABLE \
/* inst, infix_op, func */ \
X(Fadd, +, ) \
X(Fsub, -, ) \
X(Fmul, *, ) \
X(Fdiv, /, ) \
X(Frem, COMMA, myFrem) \
// X-macro table of floating-point operations. The third column names a helper
// function for ops that lower to a call (Frem -> myFrem); for infix ops it is
// empty. Frem's "operator" is COMMA so that an expansion of the form
// "func(a op b)" becomes the call "myFrem(a , b)" -- this is why COMMA is
// defined just above.
#define FPOP_TABLE \
  /* inst, infix_op, func */ \
  X(Fadd, +, ) \
  X(Fsub, -, ) \
  X(Fmul, *, ) \
  X(Fdiv, /, ) \
  X(Frem, COMMA, myFrem) \
//#define X(inst, op, func)
// Note: The above definition of COMMA, plus the "func" argument to
......@@ -55,30 +55,51 @@
// instruction and "(a + b)" for the Fadd instruction. The two
// versions of myFrem() are defined in a separate bitcode file.
#define INT_VALUE_ARRAY \
{ 0x0, 0x1, 0x7ffffffe, 0x7fffffff, \
0x80000000, 0x80000001, 0xfffffffe, 0xffffffff, \
0x1e, 0x1f, 0x20, 0x21, 0x3e, 0x3f, 0x40, 0x41, \
0x7e, 0x7f, 0x80, 0x81, \
0xfe, 0xff, 0x100, 0x101, \
0x7ffe, 0x7fff, 0x8000, 0x8001, \
// INT_VALUE_ARRAY is the brace-enclosed list of integer test inputs: 0, 1,
// values straddling the sign/width boundaries of 8-, 16-, and 32-bit types
// (0x7f/0x80, 0x7fff/0x8000, 0x7fffffff/0x80000000, ...), and values around
// common shift-amount boundaries (0x1e-0x21, 0x3e-0x41).
#define INT_VALUE_ARRAY \
  { 0x0, 0x1, 0x7ffffffe, 0x7fffffff, \
    0x80000000, 0x80000001, 0xfffffffe, 0xffffffff, \
    0x1e, 0x1f, 0x20, 0x21, 0x3e, 0x3f, 0x40, 0x41, \
    0x7e, 0x7f, 0x80, 0x81, \
    0xfe, 0xff, 0x100, 0x101, \
    0x7ffe, 0x7fff, 0x8000, 0x8001, \
    0xfffe, 0xffff, 0x10000, 0x10001 }
#define FP_VALUE_ARRAY(NegInf, PosInf, NegNan, NaN) \
{ 0, 1, 1.4, \
1.5, 1.6, -1.4, \
-1.5, -1.6, 0x7e, \
0x7f, 0x80, 0x81, \
0xfe, 0xff, 0x7ffe, \
0x7fff, 0x8000, 0x8001, \
0xfffe, 0xffff, 0x7ffffffe, \
0x7fffffff, 0x80000000, 0x80000001, \
0xfffffffe, 0xffffffff, 0x100000000ll, \
0x100000001ll, 0x7ffffffffffffffell, 0x7fffffffffffffffll, \
0x8000000000000000ll, 0x8000000000000001ll, 0xfffffffffffffffell, \
0xffffffffffffffffll, NegInf, PosInf, \
Nan, NegNan, -0.0, \
10.0, FLT_MIN, FLT_MAX, \
// FP_VALUE_ARRAY is the brace-enclosed list of floating-point test inputs.
// The four parameters supply the non-finite values so each caller can spell
// its infinities/NaNs appropriately. The list mixes small reals, values at
// integer-type boundaries, and the float/double range extremes.
//
// Fix: the body previously used the token "Nan" where the declared parameter
// is "NaN", so the fourth argument was never substituted and every expansion
// contained a stray, undeclared "Nan" identifier.
#define FP_VALUE_ARRAY(NegInf, PosInf, NegNan, NaN) \
  { 0, 1, 1.4, \
    1.5, 1.6, -1.4, \
    -1.5, -1.6, 0x7e, \
    0x7f, 0x80, 0x81, \
    0xfe, 0xff, 0x7ffe, \
    0x7fff, 0x8000, 0x8001, \
    0xfffe, 0xffff, 0x7ffffffe, \
    0x7fffffff, 0x80000000, 0x80000001, \
    0xfffffffe, 0xffffffff, 0x100000000ll, \
    0x100000001ll, 0x7ffffffffffffffell, 0x7fffffffffffffffll, \
    0x8000000000000000ll, 0x8000000000000001ll, 0xfffffffffffffffell, \
    0xffffffffffffffffll, NegInf, PosInf, \
    NaN, NegNan, -0.0, \
    10.0, FLT_MIN, FLT_MAX, \
    DBL_MIN, DBL_MAX }
// X-macro table of multiplication constants for the strength-reduction
// crosstests: zero, powers of two (1, 2, 4, 8), powers of two plus/minus one
// (3, 5, 7, 9), small composites (10, 25, 100, 232), and 32-bit patterns
// exercising long runs of 1s, isolated bits, and the sign bit
// (0x00FFF001, 0x01000000, 0x7FFFF07F, 0x80000000).
#define MULIMM_TABLE \
  /* mult_by */ \
  X( 0) \
  X( 1) \
  X( 2) \
  X( 3) \
  X( 4) \
  X( 5) \
  X( 7) \
  X( 8) \
  X( 9) \
  X( 10) \
  X( 25) \
  X( 100) \
  X( 232) \
  X(0x00FFF001) \
  X(0x01000000) \
  X(0x7FFFF07F) \
  X(0x80000000) \
//#define X(mult_by)
#endif // TEST_ARITH_DEF
......@@ -60,3 +60,17 @@ double mySqrt(double a);
float myFabs(float a);
double myFabs(double a);
v4f32 myFabs(v4f32 a);
// Declares, for each MULIMM_TABLE constant, the testMultiplyBy/
// testMultiplyByNeg overloads so callers can reference both the native and
// the Subzero-compiled definitions of these functions.
#define X(mult_by) \
  bool testMultiplyBy##mult_by(bool a, bool); \
  bool testMultiplyByNeg##mult_by(bool a, bool); \
  uint8_t testMultiplyBy##mult_by(uint8_t a, uint8_t); \
  uint8_t testMultiplyByNeg##mult_by(uint8_t a, uint8_t); \
  uint16_t testMultiplyBy##mult_by(uint16_t a, uint16_t); \
  uint16_t testMultiplyByNeg##mult_by(uint16_t a, uint16_t); \
  uint32_t testMultiplyBy##mult_by(uint32_t a, uint32_t); \
  uint32_t testMultiplyByNeg##mult_by(uint32_t a, uint32_t); \
  uint64_t testMultiplyBy##mult_by(uint64_t a, uint64_t); \
  uint64_t testMultiplyByNeg##mult_by(uint64_t a, uint64_t);
MULIMM_TABLE
#undef X
......@@ -73,7 +73,15 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
,
SINTOP_TABLE
#undef X
};
#define X(mult_by) \
{ \
"Mult-By-" STR(mult_by), testMultiplyBy##mult_by, \
Subzero_::testMultiplyBy##mult_by, NULL, NULL, false \
} \
, {"Mult-By-Neg-" STR(mult_by), testMultiplyByNeg##mult_by, \
Subzero_::testMultiplyByNeg##mult_by, NULL, NULL, false},
MULIMM_TABLE};
#undef X
const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
if (sizeof(TypeUnsigned) <= sizeof(uint32_t)) {
......
......@@ -32,6 +32,7 @@
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <array>
#include <utility>
namespace Ice {
......@@ -2036,6 +2037,154 @@ void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op,
}
}
namespace {
// StrengthReduction is a namespace with the strength reduction machinery. The
// entry point is the StrengthReduction::tryToOptimize method. It returns true
// if the optimization can be performed, and false otherwise.
//
// If the optimization can be performed, tryToOptimize sets its NumOperations
// parameter to the number of shifts that are needed to perform the
// multiplication; and it sets the Operations parameter with <ShAmt, AddOrSub>
// tuples that describe how to materialize the multiplication.
//
// The algorithm finds contiguous 1s in the Multiplication source, and uses one
// or two shifts to materialize it. A sequence of 1s, e.g.,
//
// M N
// ...00000000000011111...111110000000...
//
// is materializable with (1 << (M + 1)) - (1 << N):
//
// ...00000000000100000...000000000000... [1 << (M + 1)]
// ...00000000000000000...000010000000... (-) [1 << N]
// --------------------------------------
// ...00000000000011111...111110000000...
//
// And a single bit set, which is just a left shift.
namespace StrengthReduction {
// AggregationOperation tags how one step of the strength-reduced
// multiplication combines its shifted operand into the running result:
// added (AO_Add) or subtracted (AO_Sub). AO_Invalid marks a
// default-constructed, not-yet-filled element.
enum AggregationOperation {
  AO_Invalid,
  AO_Add,
  AO_Sub,
};
// AggregateElement is a glorified <ShAmt, AddOrSub> tuple.
class AggregationElement {
AggregationElement(const AggregationElement &) = delete;
public:
AggregationElement() = default;
AggregationElement &operator=(const AggregationElement &) = default;
AggregationElement(AggregationOperation Op, uint32_t ShAmt)
: Op(Op), ShAmt(ShAmt) {}
Operand *createShiftedOperand(Cfg *Func, Variable *OpR) const {
assert(OpR->mustHaveReg());
if (ShAmt == 0) {
return OpR;
}
return OperandARM32FlexReg::create(
Func, IceType_i32, OpR, OperandARM32::LSL,
OperandARM32ShAmtImm::create(
Func, llvm::cast<ConstantInteger32>(
Func->getContext()->getConstantInt32(ShAmt))));
}
bool aggregateWithAdd() const {
switch (Op) {
case AO_Invalid:
llvm::report_fatal_error("Invalid Strength Reduction Operations.");
case AO_Add:
return true;
case AO_Sub:
return false;
}
}
uint32_t shAmt() const { return ShAmt; }
private:
AggregationOperation Op = AO_Invalid;
uint32_t ShAmt;
};
// [RangeStart, RangeEnd] is a range of 1s in Src.
template <std::size_t N>
bool addOperations(uint32_t RangeStart, uint32_t RangeEnd, SizeT *NumOperations,
std::array<AggregationElement, N> *Operations) {
assert(*NumOperations < N);
if (RangeStart == RangeEnd) {
// Single bit set:
// Src : 0...00010...
// RangeStart : ^
// RangeEnd : ^
// NegSrc : 0...00001...
(*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart);
++(*NumOperations);
return true;
}
// Sequence of 1s: (two operations required.)
// Src : 0...00011...110...
// RangeStart : ^
// RangeEnd : ^
// NegSrc : 0...00000...001...
if (*NumOperations + 1 >= N) {
return false;
}
(*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart + 1);
++(*NumOperations);
(*Operations)[*NumOperations] = AggregationElement(AO_Sub, RangeEnd);
++(*NumOperations);
return true;
}
// tryToOptimize scans Src looking for sequences of 1s (including the unitary
// bit 1 surrounded by zeroes), filling *Operations with the shift/add/sub
// steps that materialize multiplication by Src and setting *NumOperations.
// Returns true if Src was fully decomposed within the N available slots,
// false otherwise (in which case the caller should fall back to mul).
template <std::size_t N>
bool tryToOptimize(uint32_t Src, SizeT *NumOperations,
                   std::array<AggregationElement, N> *Operations) {
  constexpr uint32_t SrcSizeBits = sizeof(Src) * CHAR_BIT;
  uint32_t NegSrc = ~Src;
  *NumOperations = 0;
  while (Src != 0 && *NumOperations < N) {
    // Each step of the algorithm:
    //   * finds L, the last bit set in Src;
    //   * clears all the upper bits in NegSrc up to bit L;
    //   * finds nL, the last bit set in NegSrc;
    //   * clears all the upper bits in Src up to bit nL;
    //
    // if L == nL + 1, then a unitary 1 was found in Src. Otherwise, a sequence
    // of 1s starting at L, and ending at nL + 1, was found.
    const uint32_t SrcLastBitSet = llvm::findLastSet(Src);
    // Mask keeping only the NegSrc bits strictly below SrcLastBitSet. The
    // SrcLastBitSet == 0 case is special-cased because the shift would
    // otherwise be by SrcSizeBits, which is undefined.
    const uint32_t NegSrcClearMask =
        (SrcLastBitSet == 0) ? 0
                             : (0xFFFFFFFFu) >> (SrcSizeBits - SrcLastBitSet);
    NegSrc &= NegSrcClearMask;
    if (NegSrc == 0) {
      // No 0 bit below SrcLastBitSet: the run of 1s extends down to bit 0.
      if (addOperations(SrcLastBitSet, 0, NumOperations, Operations)) {
        return true;
      }
      return false;
    }
    const uint32_t NegSrcLastBitSet = llvm::findLastSet(NegSrc);
    assert(NegSrcLastBitSet < SrcLastBitSet);
    // Drop the just-handled run [NegSrcLastBitSet + 1, SrcLastBitSet] from
    // Src before recording it, so the next iteration sees the next run.
    const uint32_t SrcClearMask =
        (NegSrcLastBitSet == 0) ? 0 : (0xFFFFFFFFu) >>
                                          (SrcSizeBits - NegSrcLastBitSet);
    Src &= SrcClearMask;
    if (!addOperations(SrcLastBitSet, NegSrcLastBitSet + 1, NumOperations,
                       Operations)) {
      return false;
    }
  }
  // Src == 0 means every run was consumed; nonzero means the loop stopped
  // because the Operations array filled up before Src was decomposed.
  return Src == 0;
}
} // end of namespace StrengthReduction
} // end of anonymous namespace
void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
Variable *Dest = Inst->getDest();
......@@ -2044,29 +2193,30 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
return;
}
if (Dest->getType() == IceType_i1) {
Type DestTy = Dest->getType();
if (DestTy == IceType_i1) {
lowerInt1Arithmetic(Inst);
return;
}
Operand *Src0 = legalizeUndef(Inst->getSrc(0));
Operand *Src1 = legalizeUndef(Inst->getSrc(1));
if (Dest->getType() == IceType_i64) {
if (DestTy == IceType_i64) {
lowerInt64Arithmetic(Inst->getOp(), Inst->getDest(), Src0, Src1);
return;
}
if (isVectorType(Dest->getType())) {
if (isVectorType(DestTy)) {
// Add a fake def to keep liveness consistent in the meantime.
Variable *T = makeReg(Dest->getType());
Variable *T = makeReg(DestTy);
Context.insert(InstFakeDef::create(Func, T));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
return;
}
// Dest->getType() is a non-i64 scalar.
Variable *T = makeReg(Dest->getType());
// DestTy is a non-i64 scalar.
Variable *T = makeReg(DestTy);
// * Handle div/rem separately. They require a non-legalized Src1 to inspect
// whether or not Src1 is a non-zero constant. Once legalized it is more
......@@ -2107,7 +2257,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
case InstArithmetic::Frem: {
constexpr SizeT MaxSrcs = 2;
Variable *Src0R = legalizeToReg(Src0);
Type Ty = Dest->getType();
Type Ty = DestTy;
InstCall *Call = makeHelperCall(
isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
Call->addArg(Src0R);
......@@ -2205,8 +2355,6 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
}
case InstArithmetic::Sub: {
if (Srcs.hasConstOperand()) {
// TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed
// to be used.
if (Srcs.immediateIsFlexEncodable()) {
Variable *Src0R = Srcs.src0R(this);
Operand *Src1RF = Srcs.src1RF(this);
......@@ -2233,6 +2381,85 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
return;
}
case InstArithmetic::Mul: {
const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1;
if (!OptM1 && Srcs.hasConstOperand()) {
constexpr std::size_t MaxShifts = 4;
std::array<StrengthReduction::AggregationElement, MaxShifts> Shifts;
SizeT NumOperations;
int32_t Const = Srcs.getConstantValue();
const bool Invert = Const < 0;
const bool MultiplyByZero = Const == 0;
Operand *_0 =
legalize(Ctx->getConstantZero(DestTy), Legal_Reg | Legal_Flex);
if (MultiplyByZero) {
_mov(T, _0);
_mov(Dest, T);
return;
}
if (Invert) {
Const = -Const;
}
if (StrengthReduction::tryToOptimize(Const, &NumOperations, &Shifts)) {
assert(NumOperations >= 1);
Variable *Src0R = Srcs.src0R(this);
int32_t Start;
int32_t End;
if (NumOperations == 1 || Shifts[NumOperations - 1].shAmt() != 0) {
// Multiplication by a power of 2 (NumOperations == 1); or
// Multiplication by a even number not a power of 2.
Start = 1;
End = NumOperations;
assert(Shifts[0].aggregateWithAdd());
_lsl(T, Src0R, shAmtImm(Shifts[0].shAmt()));
} else {
// Multiplication by an odd number. Put the free barrel shifter to a
// good use.
Start = 0;
End = NumOperations - 2;
const StrengthReduction::AggregationElement &Last =
Shifts[NumOperations - 1];
const StrengthReduction::AggregationElement &SecondToLast =
Shifts[NumOperations - 2];
if (!Last.aggregateWithAdd()) {
assert(SecondToLast.aggregateWithAdd());
_rsb(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
} else if (!SecondToLast.aggregateWithAdd()) {
assert(Last.aggregateWithAdd());
_sub(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
} else {
_add(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
}
}
// Odd numbers : S E I I
// +---+---+---+---+---+---+ ... +---+---+---+---+
// Shifts = | | | | | | | ... | | | | |
// +---+---+---+---+---+---+ ... +---+---+---+---+
// Even numbers: I S E
//
// S: Start; E: End; I: Init
for (int32_t I = Start; I < End; ++I) {
const StrengthReduction::AggregationElement &Current = Shifts[I];
Operand *SrcF = Current.createShiftedOperand(Func, Src0R);
if (Current.aggregateWithAdd()) {
_add(T, T, SrcF);
} else {
_sub(T, T, SrcF);
}
}
if (Invert) {
// T = 0 - T.
_rsb(T, T, _0);
}
_mov(Dest, T);
return;
}
}
Variable *Src0R = Srcs.unswappedSrc0R(this);
Variable *Src1R = Srcs.unswappedSrc1R(this);
_mul(T, Src0R, Src1R);
......@@ -2248,7 +2475,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
}
case InstArithmetic::Lshr: {
Variable *Src0R = Srcs.unswappedSrc0R(this);
if (Dest->getType() != IceType_i32) {
if (DestTy != IceType_i32) {
_uxt(Src0R, Src0R);
}
_lsr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this));
......@@ -2257,7 +2484,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
}
case InstArithmetic::Ashr: {
Variable *Src0R = Srcs.unswappedSrc0R(this);
if (Dest->getType() != IceType_i32) {
if (DestTy != IceType_i32) {
_sxt(Src0R, Src0R);
}
_asr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this));
......
......@@ -977,15 +977,16 @@ private:
// AllowTemporaryWithNoReg indicates if TargetARM32::makeReg() can be invoked
// without specifying a physical register. This is needed for creating unbound
// temporaries during Ice -> ARM lowering, but before register allocation.
// This a safe-guard that, during the legalization post-passes no unbound
// temporaries are created.
// This a safe-guard that no unbound temporaries are created during the
// legalization post-passes.
bool AllowTemporaryWithNoReg = true;
// ForbidTemporaryWithoutReg is a RAII class that manages
// AllowTemporaryWithNoReg.
class ForbidTemporaryWithoutReg {
ForbidTemporaryWithoutReg() = delete;
ForbidTemporaryWithoutReg(const ForbidTemporaryWithoutReg&) = delete;
ForbidTemporaryWithoutReg &operator=(const ForbidTemporaryWithoutReg&) = delete;
ForbidTemporaryWithoutReg(const ForbidTemporaryWithoutReg &) = delete;
ForbidTemporaryWithoutReg &
operator=(const ForbidTemporaryWithoutReg &) = delete;
public:
explicit ForbidTemporaryWithoutReg(TargetARM32 *Target) : Target(Target) {
......
......@@ -11,7 +11,7 @@
; RUN: --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
; RUN: -i %s --args -O2 --skip-unimplemented \
; RUN: | %if --need=target_ARM32 --need=allow_dump \
; RUN: --command FileCheck --check-prefix ARM32 %s
; RUN:   --command FileCheck --check-prefix ARM32 --check-prefix ARM32-OPT2 %s
; RUN: %if --need=target_ARM32 --need=allow_dump \
; RUN: --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
; RUN: -i %s --args -O2 --mattr=hwdiv-arm --skip-unimplemented \
......@@ -21,7 +21,7 @@
; RUN: --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
; RUN: -i %s --args -Om1 --skip-unimplemented \
; RUN: | %if --need=target_ARM32 --need=allow_dump \
; RUN: --command FileCheck --check-prefix ARM32 %s
; RUN: --command FileCheck --check-prefix ARM32 --check-prefix ARM32-OPTM1 %s
;
; RUN: %if --need=target_MIPS32 --need=allow_dump \
; RUN: --command %p2i --filetype=asm --assemble --disassemble --target mips32\
......@@ -117,8 +117,11 @@ entry:
; CHECK-LABEL: MulImm
; CHECK: imul e{{.*}},e{{.*}},0x63
; ARM32-LABEL: MulImm
; ARM32: movw {{.*}}, #99
; ARM32: mul r{{.*}}, r{{.*}}, r{{.*}}
; ARM32-OPTM1: movw {{.*}}, #99
; ARM32-OPTM1: mul r{{.*}}, r{{.*}}, r{{.*}}
; ARM32-OPT2: rsb [[T:r[0-9]+]], [[S:r[0-9]+]], [[S]], lsl #2
; ARM32-OPT2-DAG: add [[T]], [[T]], [[S]], lsl #7
; ARM32-OPT2-DAG: sub [[T]], [[T]], [[S]], lsl #5
; MIPS32-LABEL: MulImm
; MIPS32: mul
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment