Subzero. ARM32. New bool folding.

Improves the bool folding logic so that branches are short-circuited. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1417393003 .

Subzero. ARM32. New bool folding.
7b3d9cbb · John Porto · f2674646 · 7b3d9cbb · 7b3d9cbb · 7b3d9cbb
Commit 7b3d9cbb authored Nov 11, 2015 by John Porto
13 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -882,7 +882,7 @@ void InstARM32Br::dump(const Cfg *Func) const {
  }

  if (Label) {
-    Str << "label %" << Label->getName(Func);
+    Str << getPredicate() << ", label %" << Label->getName(Func);
  } else {
    Str << getPredicate() << ", label %" << getTargetTrue()->getName();
    if (getTargetFalse()) {

--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -1136,6 +1136,7 @@ public:
  }
  bool isRedundantAssign() const override {
    return !isMultiDest() && !isMultiSource() &&
+           getPredicate() == CondARM32::AL &&
           checkForRedundantAssign(getDest(), getSrc(0));
  }
  bool isVarAssign() const override { return llvm::isa<Variable>(getSrc(0)); }

--- a/src/IceTLS.h
+++ b/src/IceTLS.h
@@ -17,7 +17,6 @@
 #ifndef SUBZERO_SRC_ICETLS_H
 #define SUBZERO_SRC_ICETLS_H

-
 ///
 /// @defgroup /IceTLS Defines 5 macros for unifying thread_local and pthread:
 /// @{
@@ -96,7 +95,6 @@
 #define ICE_ATTRIBUTE_TLS thread_local
 #endif // !_MSC_VER

-
 #define ICE_TLS_DECLARE_FIELD(Type, FieldName)                                 \
  static ICE_ATTRIBUTE_TLS Type FieldName
 #define ICE_TLS_DEFINE_FIELD(Type, ClassName, FieldName)                       \

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -1044,7 +1044,11 @@ void TargetARM32::legalizeMovStackAddrImm(InstARM32Mov *MovInstr,
  }

  if (Legalized) {
-    _mov(Dest, Src);
+    if (MovInstr->isDestRedefined()) {
+      _mov_redefined(Dest, Src, MovInstr->getPredicate());
+    } else {
+      _mov(Dest, Src, MovInstr->getPredicate());
+    }
    MovInstr->setDeleted();
  }
 }
@@ -1346,8 +1350,57 @@ void TargetARM32::lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R,
  return;
 }

+TargetARM32::SafeBoolChain
+TargetARM32::lowerInt1Arithmetic(const InstArithmetic *Inst) {
+  Variable *Dest = Inst->getDest();
+  assert(Dest->getType() == IceType_i1);
+
+  // So folding didn't work for Inst. Not a problem: We just need to
+  // materialize the Sources, and perform the operation. We create regular
+  // Variables (and not infinite-weight ones) because this call might recurse a
+  // lot, and we might end up with tons of infinite weight temporaries.
+  assert(Inst->getSrcSize() == 2);
+  Variable *Src0 = Func->makeVariable(IceType_i1);
+  SafeBoolChain Src0Safe = lowerInt1(Src0, Inst->getSrc(0));
+
+  Operand *Src1 = Inst->getSrc(1);
+  SafeBoolChain Src1Safe = SBC_Yes;
+
+  if (!llvm::isa<Constant>(Src1)) {
+    Variable *Src1V = Func->makeVariable(IceType_i1);
+    Src1Safe = lowerInt1(Src1V, Src1);
+    Src1 = Src1V;
+  }
+
+  Variable *T = makeReg(IceType_i1);
+  Src0 = legalizeToReg(Src0);
+  Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex);
+  switch (Inst->getOp()) {
+  default:
+    // If this Unreachable is ever executed, add the offending operation to
+    // the list of valid consumers.
+    llvm::report_fatal_error("Unhandled i1 Op");
+  case InstArithmetic::And:
+    _and(T, Src0, Src1RF);
+    break;
+  case InstArithmetic::Or:
+    _orr(T, Src0, Src1RF);
+    break;
+  case InstArithmetic::Xor:
+    _eor(T, Src0, Src1RF);
+    break;
+  }
+  _mov(Dest, T);
+  return Src0Safe == SBC_Yes && Src1Safe == SBC_Yes ? SBC_Yes : SBC_No;
+}
+
 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
  Variable *Dest = Inst->getDest();
+  if (Dest->getType() == IceType_i1) {
+    lowerInt1Arithmetic(Inst);
+    return;
+  }
+
  // TODO(jvoung): Should be able to flip Src0 and Src1 if it is easier to
  // legalize Src0 to flex or Src1 to flex and there is a reversible
  // instruction. E.g., reverse subtract with immediate, register vs register,
@@ -1814,46 +1867,129 @@ void TargetARM32::lowerAssign(const InstAssign *Inst) {
  }
 }

+TargetARM32::ShortCircuitCondAndLabel TargetARM32::lowerInt1ForBranch(
+    Operand *Boolean, const LowerInt1BranchTarget &TargetTrue,
+    const LowerInt1BranchTarget &TargetFalse, uint32_t ShortCircuitable) {
+  InstARM32Label *NewShortCircuitLabel = nullptr;
+  Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
+
+  const Inst *Producer = BoolComputations.getProducerOf(Boolean);
+
+  if (Producer == nullptr) {
+    // No producer, no problem: just emit code to perform (Boolean & 1) and
+    // set the flags register. The branch should be taken if the resulting flags
+    // indicate a non-zero result.
+    _tst(legalizeToReg(Boolean), _1);
+    return ShortCircuitCondAndLabel(CondWhenTrue(CondARM32::NE));
+  }
+
+  switch (Producer->getKind()) {
+  default:
+    llvm_unreachable("Unexpected producer.");
+  case Inst::Icmp: {
+    return ShortCircuitCondAndLabel(
+        lowerIcmpCond(llvm::cast<InstIcmp>(Producer)));
+  } break;
+  case Inst::Fcmp: {
+    return ShortCircuitCondAndLabel(
+        lowerFcmpCond(llvm::cast<InstFcmp>(Producer)));
+  } break;
+  case Inst::Cast: {
+    const auto *CastProducer = llvm::cast<InstCast>(Producer);
+    assert(CastProducer->getCastKind() == InstCast::Trunc);
+    Operand *Src = CastProducer->getSrc(0);
+    if (Src->getType() == IceType_i64)
+      Src = loOperand(Src);
+    _tst(legalizeToReg(Src), _1);
+    return ShortCircuitCondAndLabel(CondWhenTrue(CondARM32::NE));
+  } break;
+  case Inst::Arithmetic: {
+    const auto *ArithProducer = llvm::cast<InstArithmetic>(Producer);
+    switch (ArithProducer->getOp()) {
+    default:
+      llvm_unreachable("Unhandled Arithmetic Producer.");
+    case InstArithmetic::And: {
+      if (!(ShortCircuitable & SC_And)) {
+        NewShortCircuitLabel = InstARM32Label::create(Func, this);
+      }
+
+      LowerInt1BranchTarget NewTarget =
+          TargetFalse.createForLabelOrDuplicate(NewShortCircuitLabel);
+
+      ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch(
+          Producer->getSrc(0), TargetTrue, NewTarget, SC_And);
+      const CondWhenTrue &Cond = CondAndLabel.Cond;
+
+      _br_short_circuit(NewTarget, Cond.invert());
+
+      InstARM32Label *const ShortCircuitLabel = CondAndLabel.ShortCircuitTarget;
+      if (ShortCircuitLabel != nullptr)
+        Context.insert(ShortCircuitLabel);
+
+      return ShortCircuitCondAndLabel(
+          lowerInt1ForBranch(Producer->getSrc(1), TargetTrue, NewTarget, SC_All)
+              .assertNoLabelAndReturnCond(),
+          NewShortCircuitLabel);
+    } break;
+    case InstArithmetic::Or: {
+      if (!(ShortCircuitable & SC_Or)) {
+        NewShortCircuitLabel = InstARM32Label::create(Func, this);
+      }
+
+      LowerInt1BranchTarget NewTarget =
+          TargetTrue.createForLabelOrDuplicate(NewShortCircuitLabel);
+
+      ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch(
+          Producer->getSrc(0), NewTarget, TargetFalse, SC_Or);
+      const CondWhenTrue &Cond = CondAndLabel.Cond;
+
+      _br_short_circuit(NewTarget, Cond);
+
+      InstARM32Label *const ShortCircuitLabel = CondAndLabel.ShortCircuitTarget;
+      if (ShortCircuitLabel != nullptr)
+        Context.insert(ShortCircuitLabel);
+
+      return ShortCircuitCondAndLabel(lowerInt1ForBranch(Producer->getSrc(1),
+                                                         NewTarget, TargetFalse,
+                                                         SC_All)
+                                          .assertNoLabelAndReturnCond(),
+                                      NewShortCircuitLabel);
+    } break;
+    }
+  }
+  }
+}
+
 void TargetARM32::lowerBr(const InstBr *Instr) {
  if (Instr->isUnconditional()) {
    _br(Instr->getTargetUnconditional());
    return;
  }
-  Operand *Cond = Instr->getCondition();
-
-  CondARM32::Cond BrCondTrue0 = CondARM32::NE;
-  CondARM32::Cond BrCondTrue1 = CondARM32::kNone;
-  CondARM32::Cond BrCondFalse = CondARM32::kNone;
-  if (!_mov_i1_to_flags(Cond, &BrCondTrue0, &BrCondTrue1, &BrCondFalse)) {
-    // "Cond" was not folded.
-    Type Ty = Cond->getType();
-    Variable *Src0R = legalizeToReg(Cond);
-    assert(Ty == IceType_i1);
-    if (Ty != IceType_i32)
-      _uxt(Src0R, Src0R);
-    Constant *_0 = Ctx->getConstantZero(IceType_i32);
-    _cmp(Src0R, _0);
-    BrCondTrue0 = CondARM32::NE;
-  }

-  if (BrCondTrue1 != CondARM32::kNone) {
-    _br(Instr->getTargetTrue(), BrCondTrue1);
-  }
+  CfgNode *TargetTrue = Instr->getTargetTrue();
+  CfgNode *TargetFalse = Instr->getTargetFalse();
+  ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch(
+      Instr->getCondition(), LowerInt1BranchTarget(TargetTrue),
+      LowerInt1BranchTarget(TargetFalse), SC_All);
+  assert(CondAndLabel.ShortCircuitTarget == nullptr);

-  if (BrCondTrue0 == CondARM32::kNone) {
-    assert(BrCondTrue1 == CondARM32::kNone);
-    _br(Instr->getTargetFalse());
-    return;
+  const CondWhenTrue &Cond = CondAndLabel.Cond;
+  if (Cond.WhenTrue1 != CondARM32::kNone) {
+    assert(Cond.WhenTrue0 != CondARM32::AL);
+    _br(TargetTrue, Cond.WhenTrue1);
  }

-  if (BrCondTrue0 == CondARM32::AL) {
-    assert(BrCondTrue1 == CondARM32::kNone);
-    assert(BrCondFalse == CondARM32::kNone);
-    _br(Instr->getTargetTrue());
-    return;
+  switch (Cond.WhenTrue0) {
+  default:
+    _br(TargetTrue, TargetFalse, Cond.WhenTrue0);
+    break;
+  case CondARM32::kNone:
+    _br(TargetFalse);
+    break;
+  case CondARM32::AL:
+    _br(TargetTrue);
+    break;
  }
-
-  _br(Instr->getTargetTrue(), Instr->getTargetFalse(), BrCondTrue0);
 }

 void TargetARM32::lowerCall(const InstCall *Instr) {
@@ -1959,6 +2095,8 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
    case IceType_void:
      break;
    case IceType_i1:
+      assert(BoolComputations.getProducerOf(Dest) == nullptr);
+    // Fall-through intended.
    case IceType_i8:
    case IceType_i16:
    case IceType_i32:
@@ -2089,18 +2227,9 @@ void TargetARM32::lowerCast(const InstCast *Inst) {
        Variable *Src0R = legalizeToReg(Src0);
        _sxt(T_Lo, Src0R);
      } else {
-        CondARM32::Cond CondTrue0, CondTrue1, CondFalse;
-        if (_mov_i1_to_flags(Src0, &CondTrue0, &CondTrue1, &CondFalse)) {
-          // Handle bool folding.
-          Constant *_0 = Ctx->getConstantZero(IceType_i32);
-          Operand *_m1 =
-              legalize(Ctx->getConstantInt32(-1), Legal_Reg | Legal_Flex);
-          _cmov(T_Lo, _m1, CondTrue0, CondTrue1, _0, CondFalse);
-        } else {
-          Variable *Src0R = legalizeToReg(Src0);
-          _lsl(T_Lo, Src0R, ShiftAmt);
-          _asr(T_Lo, T_Lo, ShiftAmt);
-        }
+        Operand *_0 = Ctx->getConstantZero(IceType_i32);
+        Operand *_m1 = Ctx->getConstantInt32(-1);
+        lowerInt1ForSelect(T_Lo, Src0, _m1, _0);
      }
      _mov(DestLo, T_Lo);
      Variable *T_Hi = makeReg(DestHi->getType());
@@ -2119,24 +2248,10 @@ void TargetARM32::lowerCast(const InstCast *Inst) {
      _sxt(T, Src0R);
      _mov(Dest, T);
    } else {
+      Constant *_0 = Ctx->getConstantZero(IceType_i32);
+      Operand *_m1 = Ctx->getConstantInt(Dest->getType(), -1);
      Variable *T = makeReg(Dest->getType());
-      CondARM32::Cond CondTrue0, CondTrue1, CondFalse;
-      if (_mov_i1_to_flags(Src0, &CondTrue0, &CondTrue1, &CondFalse)) {
-        // Handle bool folding.
-        Constant *_0 = Ctx->getConstantZero(IceType_i32);
-        Operand *_m1 =
-            legalize(Ctx->getConstantInt32(-1), Legal_Reg | Legal_Flex);
-        _cmov(T, _m1, CondTrue0, CondTrue1, _0, CondFalse);
-      } else {
-        // GPR registers are 32-bit, so just use 31 as dst_bitwidth - 1.
-        // lsl t1, src_reg, 31
-        // asr t1, t1, 31
-        // dst = t1
-        Variable *Src0R = legalizeToReg(Src0);
-        Constant *ShiftAmt = Ctx->getConstantInt32(31);
-        _lsl(T, Src0R, ShiftAmt);
-        _asr(T, T, ShiftAmt);
-      }
+      lowerInt1ForSelect(T, Src0, _m1, _0);
      _mov(Dest, T);
    }
    break;
@@ -2149,60 +2264,44 @@ void TargetARM32::lowerCast(const InstCast *Inst) {
      UnimplementedError(Func->getContext()->getFlags());
    } else if (Dest->getType() == IceType_i64) {
      // t1=uxtb src; dst.lo=t1; dst.hi=0
-      Constant *_0 = Ctx->getConstantZero(IceType_i32);
-      Constant *_1 = Ctx->getConstantInt32(1);
+      Operand *_0 =
+          legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      Variable *T_Lo = makeReg(DestLo->getType());

-      CondARM32::Cond CondTrue0, CondTrue1, CondFalse;
-      if (_mov_i1_to_flags(Src0, &CondTrue0, &CondTrue1, &CondFalse)) {
-        // Handle folding opportunities.
-        Variable *T_Hi = makeReg(DestLo->getType());
-        _mov(T_Hi, _0);
-        _mov(DestHi, T_Hi);
-        _cmov(T_Lo, _1, CondTrue0, CondTrue1, _0, CondFalse);
-        _mov(DestLo, T_Lo);
-        return;
+      switch (Src0->getType()) {
+      default: {
+        assert(Src0->getType() != IceType_i64);
+        _uxt(T_Lo, legalizeToReg(Src0));
+      } break;
+      case IceType_i32: {
+        _mov(T_Lo, legalize(Src0, Legal_Reg | Legal_Flex));
+      } break;
+      case IceType_i1: {
+        SafeBoolChain Safe = lowerInt1(T_Lo, Src0);
+        if (Safe == SBC_No) {
+          Operand *_1 =
+              legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
+          _and(T_Lo, T_Lo, _1);
+        }
+      } break;
      }

-      // i32 and i1 can just take up the whole register. i32 doesn't need uxt,
-      // while i1 will have an and mask later anyway.
-      if (Src0->getType() == IceType_i32 || Src0->getType() == IceType_i1) {
-        Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
-        _mov(T_Lo, Src0RF);
-      } else {
-        Variable *Src0R = legalizeToReg(Src0);
-        _uxt(T_Lo, Src0R);
-      }
-      if (Src0->getType() == IceType_i1) {
-        Constant *One = Ctx->getConstantInt32(1);
-        _and(T_Lo, T_Lo, One);
-      }
      _mov(DestLo, T_Lo);
+
      Variable *T_Hi = makeReg(DestLo->getType());
      _mov(T_Hi, _0);
      _mov(DestHi, T_Hi);
    } else if (Src0->getType() == IceType_i1) {
-      Constant *_1 = Ctx->getConstantInt32(1);
      Variable *T = makeReg(Dest->getType());

-      CondARM32::Cond CondTrue0, CondTrue1, CondFalse;
-      if (_mov_i1_to_flags(Src0, &CondTrue0, &CondTrue1, &CondFalse)) {
-        // Handle folding opportunities.
-        Constant *_0 = Ctx->getConstantZero(IceType_i32);
-        _cmov(T, _1, CondTrue0, CondTrue1, _0, CondFalse);
-        _mov(Dest, T);
-        return;
+      SafeBoolChain Safe = lowerInt1(T, Src0);
+      if (Safe == SBC_No) {
+        Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
+        _and(T, T, _1);
      }

-      // t = Src0; t &= 1; Dest = t
-      Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
-      // Just use _mov instead of _uxt since all registers are 32-bit. _uxt
-      // requires the source to be a register so could have required a _mov
-      // from legalize anyway.
-      _mov(T, Src0RF);
-      _and(T, T, _1);
      _mov(Dest, T);
    } else {
      // t1 = uxt src; dst = t1
@@ -2473,19 +2572,13 @@ struct {
 };
 } // end of anonymous namespace

-void TargetARM32::lowerFcmpCond(const InstFcmp *Instr,
-                                CondARM32::Cond *CondIfTrue0,
-                                CondARM32::Cond *CondIfTrue1,
-                                CondARM32::Cond *CondIfFalse) {
+TargetARM32::CondWhenTrue TargetARM32::lowerFcmpCond(const InstFcmp *Instr) {
  InstFcmp::FCond Condition = Instr->getCondition();
  switch (Condition) {
  case InstFcmp::False:
-    *CondIfFalse = CondARM32::AL;
-    *CondIfTrue0 = *CondIfTrue1 = CondARM32::kNone;
-    break;
+    return CondWhenTrue(CondARM32::kNone);
  case InstFcmp::True:
-    *CondIfFalse = *CondIfTrue1 = CondARM32::kNone;
-    *CondIfTrue0 = CondARM32::AL;
+    return CondWhenTrue(CondARM32::AL);
    break;
  default: {
    Variable *Src0R = legalizeToReg(Instr->getSrc(0));
@@ -2493,11 +2586,7 @@ void TargetARM32::lowerFcmpCond(const InstFcmp *Instr,
    _vcmp(Src0R, Src1R);
    _vmrs();
    assert(Condition < llvm::array_lengthof(TableFcmp));
-    *CondIfTrue0 = TableFcmp[Condition].CC0;
-    *CondIfTrue1 = TableFcmp[Condition].CC1;
-    *CondIfFalse = (*CondIfTrue1 != CondARM32::kNone)
-                       ? CondARM32::AL
-                       : InstARM32::getOppositeCondition(*CondIfTrue0);
+    return CondWhenTrue(TableFcmp[Condition].CC0, TableFcmp[Condition].CC1);
  }
  }
 }
@@ -2513,39 +2602,40 @@ void TargetARM32::lowerFcmp(const InstFcmp *Instr) {
  }

  Variable *T = makeReg(IceType_i1);
-  Operand *_1 = Ctx->getConstantInt32(1);
-  Operand *_0 = Ctx->getConstantZero(IceType_i32);
+  Operand *_1 = legalize(Ctx->getConstantInt32(1), Legal_Reg | Legal_Flex);
+  Operand *_0 =
+      legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);

-  CondARM32::Cond CondIfTrue0, CondIfTrue1, CondIfFalse;
-  lowerFcmpCond(Instr, &CondIfTrue0, &CondIfTrue1, &CondIfFalse);
+  CondWhenTrue Cond = lowerFcmpCond(Instr);

  bool RedefineT = false;
-  if (CondIfFalse != CondARM32::kNone) {
-    assert(!RedefineT);
-    _mov(T, _0, CondIfFalse);
+  if (Cond.WhenTrue0 != CondARM32::AL) {
+    _mov(T, _0);
    RedefineT = true;
  }

-  if (CondIfTrue0 != CondARM32::kNone) {
-    if (RedefineT) {
-      _mov_redefined(T, _1, CondIfTrue0);
-    } else {
-      _mov(T, _1, CondIfTrue0);
-    }
-    RedefineT = true;
+  if (Cond.WhenTrue0 == CondARM32::kNone) {
+    _mov(Dest, T);
+    return;
+  }
+
+  if (RedefineT) {
+    _mov_redefined(T, _1, Cond.WhenTrue0);
+  } else {
+    _mov(T, _1, Cond.WhenTrue0);
  }

-  if (CondIfTrue1 != CondARM32::kNone) {
-    assert(RedefineT);
-    _mov_redefined(T, _1, CondIfTrue1);
+  if (Cond.WhenTrue1 != CondARM32::kNone) {
+    _mov_redefined(T, _1, Cond.WhenTrue1);
  }

  _mov(Dest, T);
 }

-void TargetARM32::lowerIcmpCond(const InstIcmp *Inst,
-                                CondARM32::Cond *CondIfTrue,
-                                CondARM32::Cond *CondIfFalse) {
+TargetARM32::CondWhenTrue TargetARM32::lowerIcmpCond(const InstIcmp *Inst) {
+  assert(Inst->getSrc(0)->getType() != IceType_i1);
+  assert(Inst->getSrc(1)->getType() != IceType_i1);
+
  Operand *Src0 = legalizeUndef(Inst->getSrc(0));
  Operand *Src1 = legalizeUndef(Inst->getSrc(1));

@@ -2607,9 +2697,7 @@ void TargetARM32::lowerIcmpCond(const InstIcmp *Inst,
      _cmp(Src0Hi, Src1HiRF);
      _cmp(Src0Lo, Src1LoRF, CondARM32::EQ);
    }
-    *CondIfTrue = TableIcmp64[Index].C1;
-    *CondIfFalse = TableIcmp64[Index].C2;
-    return;
+    return CondWhenTrue(TableIcmp64[Index].C1);
  }

  // a=icmp cond b, c ==>
@@ -2661,8 +2749,7 @@ void TargetARM32::lowerIcmpCond(const InstIcmp *Inst,
    Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex);
    _cmp(Src0R, Src1RF);
  }
-  *CondIfTrue = getIcmp32Mapping(Inst->getCondition());
-  *CondIfFalse = InstARM32::getOppositeCondition(*CondIfTrue);
+  return CondWhenTrue(getIcmp32Mapping(Inst->getCondition()));
 }

 void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
@@ -2676,17 +2763,18 @@ void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
    return;
  }

-  Constant *_0 = Ctx->getConstantZero(IceType_i32);
-  Constant *_1 = Ctx->getConstantInt32(1);
+  Operand *_0 =
+      legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
+  Operand *_1 = legalize(Ctx->getConstantInt32(1), Legal_Reg | Legal_Flex);
  Variable *T = makeReg(IceType_i1);

-  CondARM32::Cond CondIfTrue, CondIfFalse;
-  lowerIcmpCond(Inst, &CondIfTrue, &CondIfFalse);
-
-  _mov(T, _0, CondIfFalse);
-  _mov_redefined(T, _1, CondIfTrue);
+  _mov(T, _0);
+  CondWhenTrue Cond = lowerIcmpCond(Inst);
+  _mov_redefined(T, _1, Cond.WhenTrue0);
  _mov(Dest, T);

+  assert(Cond.WhenTrue1 == CondARM32::kNone);
+
  return;
 }

@@ -3903,119 +3991,7 @@ void TargetARM32::lowerSelect(const InstSelect *Inst) {
    return;
  }

-  CondARM32::Cond CondIfTrue0, CondIfTrue1, CondIfFalse;
-  if (!_mov_i1_to_flags(Condition, &CondIfTrue0, &CondIfTrue1, &CondIfFalse)) {
-    // "Condition" was not fold.
-    // cmp cond, #0; mov t, SrcF; mov_cond t, SrcT; mov dest, t
-    Variable *CmpOpnd0 = legalizeToReg(Condition);
-    Type CmpOpnd0Ty = CmpOpnd0->getType();
-    Operand *CmpOpnd1 = Ctx->getConstantZero(IceType_i32);
-    assert(CmpOpnd0Ty == IceType_i1);
-    if (CmpOpnd0Ty != IceType_i32)
-      _uxt(CmpOpnd0, CmpOpnd0);
-    _cmp(CmpOpnd0, CmpOpnd1);
-    CondIfTrue0 = CondARM32::NE;
-    CondIfTrue1 = CondARM32::kNone;
-    CondIfFalse = CondARM32::EQ;
-  }
-
-  if (DestTy == IceType_i64) {
-    SrcT = legalizeUndef(SrcT);
-    SrcF = legalizeUndef(SrcF);
-    // Set the low portion.
-    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    Operand *SrcTLo = legalize(loOperand(SrcT), Legal_Reg | Legal_Flex);
-    Operand *SrcFLo = legalize(loOperand(SrcF), Legal_Reg | Legal_Flex);
-    Variable *TLo = makeReg(SrcFLo->getType());
-    bool RedefineTLo = false;
-    if (CondIfFalse != CondARM32::kNone) {
-      _mov(TLo, SrcFLo, CondIfFalse);
-      RedefineTLo = true;
-    }
-    if (CondIfTrue0 != CondARM32::kNone) {
-      if (!RedefineTLo)
-        _mov(TLo, SrcTLo, CondIfTrue0);
-      else
-        _mov_redefined(TLo, SrcTLo, CondIfTrue0);
-      RedefineTLo = true;
-    }
-    if (CondIfTrue1 != CondARM32::kNone) {
-      assert(RedefineTLo);
-      _mov_redefined(TLo, SrcTLo, CondIfTrue1);
-    }
-    _mov(DestLo, TLo);
-
-    // Set the high portion.
-    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    Operand *SrcTHi = legalize(hiOperand(SrcT), Legal_Reg | Legal_Flex);
-    Operand *SrcFHi = legalize(hiOperand(SrcF), Legal_Reg | Legal_Flex);
-    Variable *THi = makeReg(SrcFHi->getType());
-    bool RedefineTHi = false;
-    if (CondIfFalse != CondARM32::kNone) {
-      _mov(THi, SrcFHi, CondIfFalse);
-      RedefineTHi = true;
-    }
-    if (CondIfTrue0 != CondARM32::kNone) {
-      if (!RedefineTHi)
-        _mov(THi, SrcTHi, CondIfTrue0);
-      else
-        _mov_redefined(THi, SrcTHi, CondIfTrue0);
-      RedefineTHi = true;
-    }
-    if (CondIfTrue1 != CondARM32::kNone) {
-      assert(RedefineTHi);
-      _mov_redefined(THi, SrcTHi, CondIfTrue1);
-    }
-    _mov(DestHi, THi);
-    return;
-  }
-
-  if (isFloatingType(DestTy)) {
-    SrcT = legalizeToReg(SrcT);
-    SrcF = legalizeToReg(SrcF);
-    Variable *T = makeReg(DestTy);
-    assert(DestTy == SrcF->getType());
-    bool RedefineT = false;
-    if (CondIfFalse != CondARM32::kNone) {
-      _mov(T, SrcF, CondIfFalse);
-      RedefineT = true;
-    }
-    if (CondIfTrue0 != CondARM32::kNone) {
-      if (!RedefineT)
-        _mov(T, SrcT, CondIfTrue0);
-      else
-        _mov_redefined(T, SrcT, CondIfTrue0);
-      RedefineT = true;
-    }
-    if (CondIfTrue1 != CondARM32::kNone) {
-      assert(RedefineT);
-      _mov_redefined(T, SrcT, CondIfTrue1);
-    }
-    assert(DestTy == SrcT->getType());
-    _mov(Dest, T);
-    return;
-  }
-
-  Variable *T = makeReg(SrcF->getType());
-  SrcT = legalize(SrcT, Legal_Reg | Legal_Flex);
-  SrcF = legalize(SrcF, Legal_Reg | Legal_Flex);
-  bool RedefineT = false;
-  if (CondIfFalse != CondARM32::kNone) {
-    _mov(T, SrcF, CondIfFalse);
-    RedefineT = true;
-  }
-  if (CondIfTrue0 != CondARM32::kNone) {
-    if (!RedefineT)
-      _mov(T, SrcT, CondIfTrue0);
-    else
-      _mov_redefined(T, SrcT, CondIfTrue0);
-    RedefineT = true;
-  }
-  if (CondIfTrue1 != CondARM32::kNone) {
-    assert(RedefineT);
-    _mov_redefined(T, SrcT, CondIfTrue1);
-  }
-  _mov(Dest, T);
+  lowerInt1ForSelect(Dest, Condition, legalizeUndef(SrcT), legalizeUndef(SrcF));
 }

 void TargetARM32::lowerStore(const InstStore *Inst) {
@@ -4430,74 +4406,250 @@ void TargetARM32::emit(const ConstantUndef *) const {
  llvm::report_fatal_error("undef value encountered by emitter.");
 }

-void TargetARM32::lowerTruncToFlags(Operand *Src, CondARM32::Cond *CondIfTrue,
-                                    CondARM32::Cond *CondIfFalse) {
-  Operand *_1 = Ctx->getConstantInt32(1);
-  Variable *SrcR =
-      legalizeToReg(Src->getType() == IceType_i64 ? loOperand(Src) : Src);
-  _tst(SrcR, _1);
-  *CondIfTrue = CondARM32::NE;  // NE <-> APSR.Z == 0
-  *CondIfFalse = CondARM32::EQ; // EQ <-> APSR.Z == 1
+void TargetARM32::lowerInt1ForSelect(Variable *Dest, Operand *Boolean,
+                                     Operand *TrueValue, Operand *FalseValue) {
+  Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
+
+  assert(Boolean->getType() == IceType_i1);
+
+  bool NeedsAnd1 = false;
+  if (TrueValue->getType() == IceType_i1) {
+    assert(FalseValue->getType() == IceType_i1);
+
+    Variable *TrueValueV = Func->makeVariable(IceType_i1);
+    SafeBoolChain Src0Safe = lowerInt1(TrueValueV, TrueValue);
+    TrueValue = TrueValueV;
+
+    Variable *FalseValueV = Func->makeVariable(IceType_i1);
+    SafeBoolChain Src1Safe = lowerInt1(FalseValueV, FalseValue);
+    FalseValue = FalseValueV;
+
+    NeedsAnd1 = Src0Safe == SBC_No || Src1Safe == SBC_No;
+  }
+
+  Variable *DestLo = (Dest->getType() == IceType_i64)
+                         ? llvm::cast<Variable>(loOperand(Dest))
+                         : Dest;
+  Variable *DestHi = (Dest->getType() == IceType_i64)
+                         ? llvm::cast<Variable>(hiOperand(Dest))
+                         : nullptr;
+  Operand *FalseValueLo = (FalseValue->getType() == IceType_i64)
+                              ? loOperand(FalseValue)
+                              : FalseValue;
+  Operand *FalseValueHi =
+      (FalseValue->getType() == IceType_i64) ? hiOperand(FalseValue) : nullptr;
+
+  Operand *TrueValueLo =
+      (TrueValue->getType() == IceType_i64) ? loOperand(TrueValue) : TrueValue;
+  Operand *TrueValueHi =
+      (TrueValue->getType() == IceType_i64) ? hiOperand(TrueValue) : nullptr;
+
+  Variable *T_Lo = makeReg(DestLo->getType());
+  Variable *T_Hi = (DestHi == nullptr) ? nullptr : makeReg(DestHi->getType());
+
+  _mov(T_Lo, legalize(FalseValueLo, Legal_Reg | Legal_Flex));
+  if (DestHi) {
+    _mov(T_Hi, legalize(FalseValueHi, Legal_Reg | Legal_Flex));
+  }
+
+  CondWhenTrue Cond(CondARM32::kNone);
+  // FlagsWereSet is used to determine whether Boolean was folded or not. If
+  // not, add an explicit _tst instruction below.
+  bool FlagsWereSet = false;
+  if (const Inst *Producer = BoolComputations.getProducerOf(Boolean)) {
+    switch (Producer->getKind()) {
+    default:
+      llvm_unreachable("Unexpected producer.");
+    case Inst::Icmp: {
+      Cond = lowerIcmpCond(llvm::cast<InstIcmp>(Producer));
+      FlagsWereSet = true;
+    } break;
+    case Inst::Fcmp: {
+      Cond = lowerFcmpCond(llvm::cast<InstFcmp>(Producer));
+      FlagsWereSet = true;
+    } break;
+    case Inst::Cast: {
+      const auto *CastProducer = llvm::cast<InstCast>(Producer);
+      assert(CastProducer->getCastKind() == InstCast::Trunc);
+      Boolean = CastProducer->getSrc(0);
+      // No flags were set, so a _tst(Src, 1) will be emitted below. Don't
+      // bother legalizing Src to a Reg because it will be legalized before
+      // emitting the tst instruction.
+      FlagsWereSet = false;
+    } break;
+    case Inst::Arithmetic: {
+      // This is a special case: we eagerly assumed Producer could be folded,
+      // but in reality, it can't. No reason to panic: we just lower it using
+      // the regular lowerArithmetic helper.
+      const auto *ArithProducer = llvm::cast<InstArithmetic>(Producer);
+      lowerArithmetic(ArithProducer);
+      Boolean = ArithProducer->getDest();
+      // No flags were set, so a _tst(Dest, 1) will be emitted below. Don't
+      // bother legalizing Dest to a Reg because it will be legalized before
+      // emitting the tst instruction.
+      FlagsWereSet = false;
+    } break;
+    }
+  }
+
+  if (!FlagsWereSet) {
+    // No flags have been set, so emit a tst Boolean, 1.
+    Variable *Src = legalizeToReg(Boolean);
+    _tst(Src, _1);
+    Cond = CondWhenTrue(CondARM32::NE); // i.e., CondARM32::NotZero.
+  }
+
+  if (Cond.WhenTrue0 == CondARM32::kNone) {
+    assert(Cond.WhenTrue1 == CondARM32::kNone);
+  } else {
+    _mov_redefined(T_Lo, legalize(TrueValueLo, Legal_Reg | Legal_Flex),
+                   Cond.WhenTrue0);
+    if (DestHi) {
+      _mov_redefined(T_Hi, legalize(TrueValueHi, Legal_Reg | Legal_Flex),
+                     Cond.WhenTrue0);
+    }
+  }
+
+  if (Cond.WhenTrue1 != CondARM32::kNone) {
+    _mov_redefined(T_Lo, legalize(TrueValueLo, Legal_Reg | Legal_Flex),
+                   Cond.WhenTrue1);
+    if (DestHi) {
+      _mov_redefined(T_Hi, legalize(TrueValueHi, Legal_Reg | Legal_Flex),
+                     Cond.WhenTrue1);
+    }
+  }
+
+  if (NeedsAnd1) {
+    // We lowered something that is unsafe (i.e., can't provably be zero or
+    // one). Truncate the result.
+    _and(T_Lo, T_Lo, _1);
+  }
+
+  _mov(DestLo, T_Lo);
+  if (DestHi) {
+    _mov(DestHi, T_Hi);
+  }
 }

-bool TargetARM32::_mov_i1_to_flags(Operand *Boolean,
-                                   CondARM32::Cond *CondIfTrue0,
-                                   CondARM32::Cond *CondIfTrue1,
-                                   CondARM32::Cond *CondIfFalse) {
-  *CondIfTrue0 = CondARM32::kNone;
-  *CondIfTrue1 = CondARM32::kNone;
-  *CondIfFalse = CondARM32::AL;
-  bool FoldOK = false;
+TargetARM32::SafeBoolChain TargetARM32::lowerInt1(Variable *Dest,
+                                                  Operand *Boolean) {
+  assert(Boolean->getType() == IceType_i1);
+  Variable *T = makeReg(IceType_i1);
+  Operand *_0 =
+      legalize(Ctx->getConstantZero(IceType_i1), Legal_Reg | Legal_Flex);
+  Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
+
+  SafeBoolChain Safe = SBC_Yes;
  if (const Inst *Producer = BoolComputations.getProducerOf(Boolean)) {
-    if (const auto *IcmpProducer = llvm::dyn_cast<InstIcmp>(Producer)) {
-      lowerIcmpCond(IcmpProducer, CondIfTrue0, CondIfFalse);
-      FoldOK = true;
-    } else if (const auto *FcmpProducer = llvm::dyn_cast<InstFcmp>(Producer)) {
-      lowerFcmpCond(FcmpProducer, CondIfTrue0, CondIfTrue1, CondIfFalse);
-      FoldOK = true;
-    } else if (const auto *CastProducer = llvm::dyn_cast<InstCast>(Producer)) {
+    switch (Producer->getKind()) {
+    default:
+      llvm_unreachable("Unexpected producer.");
+    case Inst::Icmp: {
+      _mov(T, _0);
+      CondWhenTrue Cond = lowerIcmpCond(llvm::cast<InstIcmp>(Producer));
+      assert(Cond.WhenTrue0 != CondARM32::AL);
+      assert(Cond.WhenTrue0 != CondARM32::kNone);
+      assert(Cond.WhenTrue1 == CondARM32::kNone);
+      _mov_redefined(T, _1, Cond.WhenTrue0);
+    } break;
+    case Inst::Fcmp: {
+      _mov(T, _0);
+      Inst *MovZero = Context.getLastInserted();
+      CondWhenTrue Cond = lowerFcmpCond(llvm::cast<InstFcmp>(Producer));
+      if (Cond.WhenTrue0 == CondARM32::AL) {
+        assert(Cond.WhenTrue1 == CondARM32::kNone);
+        MovZero->setDeleted();
+        _mov(T, _1);
+      } else if (Cond.WhenTrue0 != CondARM32::kNone) {
+        _mov_redefined(T, _1, Cond.WhenTrue0);
+      }
+      if (Cond.WhenTrue1 != CondARM32::kNone) {
+        assert(Cond.WhenTrue0 != CondARM32::kNone);
+        assert(Cond.WhenTrue0 != CondARM32::AL);
+        _mov_redefined(T, _1, Cond.WhenTrue1);
+      }
+    } break;
+    case Inst::Cast: {
+      const auto *CastProducer = llvm::cast<InstCast>(Producer);
      assert(CastProducer->getCastKind() == InstCast::Trunc);
-      lowerTruncToFlags(CastProducer->getSrc(0), CondIfTrue0, CondIfFalse);
-      FoldOK = true;
+      Operand *Src = CastProducer->getSrc(0);
+      if (Src->getType() == IceType_i64)
+        Src = loOperand(Src);
+      _mov(T, legalize(Src, Legal_Reg | Legal_Flex));
+      Safe = SBC_No;
+    } break;
+    case Inst::Arithmetic: {
+      const auto *ArithProducer = llvm::cast<InstArithmetic>(Producer);
+      Safe = lowerInt1Arithmetic(ArithProducer);
+      _mov(T, ArithProducer->getDest());
+    } break;
    }
+  } else {
+    _mov(T, legalize(Boolean, Legal_Reg | Legal_Flex));
  }
-  return FoldOK;
+
+  _mov(Dest, T);
+  return Safe;
 }

 namespace {
 namespace BoolFolding {
 bool shouldTrackProducer(const Inst &Instr) {
-  switch (static_cast<uint32_t>(Instr.getKind())) {
+  switch (Instr.getKind()) {
+  default:
+    return false;
  case Inst::Icmp:
-    return true;
  case Inst::Fcmp:
    return true;
-  }
-  if (const auto *Cast = llvm::dyn_cast<InstCast>(&Instr)) {
-    switch (static_cast<uint32_t>(Cast->getCastKind())) {
+  case Inst::Cast: {
+    switch (llvm::cast<InstCast>(&Instr)->getCastKind()) {
+    default:
+      return false;
    case InstCast::Trunc:
      return true;
    }
  }
-  return false;
+  case Inst::Arithmetic: {
+    switch (llvm::cast<InstArithmetic>(&Instr)->getOp()) {
+    default:
+      return false;
+    case InstArithmetic::And:
+    case InstArithmetic::Or:
+      return true;
+    }
+  }
+  }
 }

 bool isValidConsumer(const Inst &Instr) {
-  switch (static_cast<uint32_t>(Instr.getKind())) {
+  switch (Instr.getKind()) {
+  default:
+    return false;
  case Inst::Br:
    return true;
  case Inst::Select:
    return !isVectorType(Instr.getDest()->getType());
-  }
-  if (const auto *Cast = llvm::dyn_cast<InstCast>(&Instr)) {
-    switch (static_cast<uint32_t>(Cast->getCastKind())) {
+  case Inst::Cast: {
+    switch (llvm::cast<InstCast>(&Instr)->getCastKind()) {
+    default:
+      return false;
    case InstCast::Sext:
      return !isVectorType(Instr.getDest()->getType());
    case InstCast::Zext:
      return !isVectorType(Instr.getDest()->getType());
    }
  }
-  return false;
+  case Inst::Arithmetic: {
+    switch (llvm::cast<InstArithmetic>(&Instr)->getOp()) {
+    default:
+      return false;
+    case InstArithmetic::And:
+      return !isVectorType(Instr.getDest()->getType());
+    case InstArithmetic::Or:
+      return !isVectorType(Instr.getDest()->getType());
+    }
+  }
+  }
 }
 } // end of namespace BoolFolding
 } // end of anonymous namespace
@@ -4520,9 +4672,8 @@ void TargetARM32::BoolComputationTracker::recordProducers(CfgNode *Node) {
        continue;
      }

-      if (IndexOfVarOperandInInst(Var) != 0 ||
-          !BoolFolding::isValidConsumer(Instr)) {
-        // All valid consumers use Var as the first source operand
+      ++ComputationIter->second.NumUses;
+      if (!BoolFolding::isValidConsumer(Instr)) {
        KnownComputations.erase(VarNum);
        continue;
      }
@@ -4536,7 +4687,7 @@ void TargetARM32::BoolComputationTracker::recordProducers(CfgNode *Node) {
  for (auto Iter = KnownComputations.begin(), End = KnownComputations.end();
       Iter != End;) {
    // Disable the folding if its dest may be live beyond this block.
-    if (Iter->second.IsLiveOut) {
+    if (Iter->second.IsLiveOut || Iter->second.NumUses > 1) {
      Iter = KnownComputations.erase(Iter);
      continue;
    }

--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -135,19 +135,52 @@ protected:

  void postLower() override;

+  enum SafeBoolChain {
+    SBC_No,
+    SBC_Yes,
+  };
+
  void lowerAlloca(const InstAlloca *Inst) override;
+  SafeBoolChain lowerInt1Arithmetic(const InstArithmetic *Inst);
  void lowerArithmetic(const InstArithmetic *Inst) override;
  void lowerAssign(const InstAssign *Inst) override;
  void lowerBr(const InstBr *Inst) override;
  void lowerCall(const InstCall *Inst) override;
  void lowerCast(const InstCast *Inst) override;
  void lowerExtractElement(const InstExtractElement *Inst) override;
-  void lowerFcmpCond(const InstFcmp *Instr, CondARM32::Cond *CondIfTrue0,
-                     CondARM32::Cond *CondIfTrue1,
-                     CondARM32::Cond *CondIfFalse);
+
+  /// CondWhenTrue is a helper type returned by every method in the lowering
+  /// that emits code to set the condition codes.
+  class CondWhenTrue {
+  public:
+    explicit CondWhenTrue(CondARM32::Cond T0,
+                          CondARM32::Cond T1 = CondARM32::kNone)
+        : WhenTrue0(T0), WhenTrue1(T1) {
+      assert(T1 == CondARM32::kNone || T0 != CondARM32::kNone);
+      assert(T1 != T0 || T0 == CondARM32::kNone);
+    }
+    CondARM32::Cond WhenTrue0;
+    CondARM32::Cond WhenTrue1;
+
+    /// invert returns a new object with WhenTrue0 and WhenTrue1 inverted.
+    CondWhenTrue invert() const {
+      switch (WhenTrue0) {
+      default:
+        if (WhenTrue1 == CondARM32::kNone)
+          return CondWhenTrue(InstARM32::getOppositeCondition(WhenTrue0));
+        return CondWhenTrue(InstARM32::getOppositeCondition(WhenTrue0),
+                            InstARM32::getOppositeCondition(WhenTrue1));
+      case CondARM32::AL:
+        return CondWhenTrue(CondARM32::kNone);
+      case CondARM32::kNone:
+        return CondWhenTrue(CondARM32::AL);
+      }
+    }
+  };
+
+  CondWhenTrue lowerFcmpCond(const InstFcmp *Instr);
  void lowerFcmp(const InstFcmp *Instr) override;
-  void lowerIcmpCond(const InstIcmp *Instr, CondARM32::Cond *CondIfTrue,
-                     CondARM32::Cond *CondIfFalse);
+  CondWhenTrue lowerIcmpCond(const InstIcmp *Instr);
  void lowerIcmp(const InstIcmp *Instr) override;
  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
                      Operand *Val);
@@ -334,58 +367,232 @@ protected:
    }
  }

-  // _mov_i1_to_flags is used for bool folding. If "Boolean" is folded, this
-  // method returns true, and sets "CondIfTrue0" and "CondIfTrue1" to the
-  // appropriate ARM condition codes. If "Boolean" is not to be folded, then
-  // this method returns false.
-  bool _mov_i1_to_flags(Operand *Boolean, CondARM32::Cond *CondIfTrue0,
-                        CondARM32::Cond *CondIfTrue1,
-                        CondARM32::Cond *CondIfFalse);
-
-  // _cmov is a pseudo instruction that is used for boolean folding. It emits
-  // code that moves "SrcIfTrue" to dest if either "CondIfTrue0" or
-  // "CondIfTrue1" holds, and "SrcIfFalse", if "CondIfFalse" holds. It requires
-  // "Dest" to be an infinite-weight temporary.
-  void _cmov(Variable *Dest, Operand *SrcIfTrue, CondARM32::Cond CondIfTrue0,
-             CondARM32::Cond CondIfTrue1, Operand *SrcIfFalse,
-             CondARM32::Cond CondIfFalse) {
-    assert(Dest->mustHaveReg());
-
-    if (CondIfFalse == CondARM32::kNone) {
-      assert(CondIfTrue0 == CondARM32::AL);
-      assert(CondIfTrue1 == CondARM32::kNone);
+  // --------------------------------------------------------------------------
+  // Begin bool folding machinery.
+  //
+  // There are three types of boolean lowerings handled by this target:
+  //
+  // 1) Boolean expressions leading to a boolean Variable definition
+  // ---------------------------------------------------------------
+  //
+  // Whenever an i1 Variable is live out (i.e., its live range extends beyond
+  // the defining basic block) we do not fold the operation. We instead
+  // materialize (i.e., compute) the variable normally, so that it can be used
+  // when needed. We also materialize i1 values that are not single use to
+  // avoid code duplication. These expressions are not short circuited.
+  //
+  // 2) Boolean expressions leading to a select
+  // ------------------------------------------
+  //
+  // These include boolean chains leading to a select instruction, as well as
+  // i1 Sexts. These boolean expressions are lowered to:
+  //
+  // mov T, <false value>
+  // CC <- eval(Boolean Expression)
+  // movCC T, <true value>
+  //
+  // For Sexts, <false value> is 0, and <true value> is -1.
+  //
+  // 3) Boolean expressions leading to a br i1
+  // -----------------------------------------
+  //
+  // These are the boolean chains leading to a branch. These chains are
+  // short-circuited, i.e.:
+  //
+  //   A = or i1 B, C
+  //   br i1 A, label %T, label %F
+  //
+  // becomes
+  //
+  //   tst B
+  //   jne %T
+  //   tst C
+  //   jne %T
+  //   j %F
+  //
+  // and
+  //
+  //   A = and i1 B, C
+  //   br i1 A, label %T, label %F
+  //
+  // becomes
+  //
+  //   tst B
+  //   jeq %F
+  //   tst C
+  //   jeq %F
+  //   j %T
+  //
+  // Arbitrarily long chains are short circuited, e.g.:
+  //
+  //   A = or  i1 B, C
+  //   D = and i1 A, E
+  //   F = and i1 G, H
+  //   I = or i1 D, F
+  //   br i1 I, label %True, label %False
+  //
+  // becomes
+  //
+  // Label[A]:
+  //   tst B, 1
+  //   bne Label[D]
+  //   tst C, 1
+  //   beq Label[I]
+  // Label[D]:
+  //   tst E, 1
+  //   bne %True
+  // Label[I]:
+  //   tst G, 1
+  //   beq %False
+  //   tst H, 1
+  //   beq %False (bne %True)
+
+  /// lowerInt1 materializes Boolean to a Variable.
+  SafeBoolChain lowerInt1(Variable *Dest, Operand *Boolean);
+
+  /// lowerInt1ForSelect generates the following instruction sequence:
+  ///
+  ///   mov T, FalseValue
+  ///   CC <- eval(Boolean)
+  ///   movCC T, TrueValue
+  ///   mov Dest, T
+  ///
+  /// It is used for lowering select i1, as well as i1 Sext.
+  void lowerInt1ForSelect(Variable *Dest, Operand *Boolean, Operand *TrueValue,
+                          Operand *FalseValue);
+
+  /// LowerInt1BranchTarget is used by lowerInt1ForBranch. It wraps a CfgNode,
+  /// or an InstARM32Label (but never both) so that, during br i1 lowering, we
+  /// can create auxiliary labels for short circuiting the condition evaluation.
+  class LowerInt1BranchTarget {
+  public:
+    explicit LowerInt1BranchTarget(CfgNode *const Target)
+        : NodeTarget(Target) {}
+    explicit LowerInt1BranchTarget(InstARM32Label *const Target)
+        : LabelTarget(Target) {}
+
+    /// createForLabelOrDuplicate will return a new LowerInt1BranchTarget that
+    /// is the exact copy of this if Label is nullptr; otherwise, the returned
+    /// object will wrap Label instead.
+    LowerInt1BranchTarget
+    createForLabelOrDuplicate(InstARM32Label *Label) const {
+      if (Label != nullptr)
+        return LowerInt1BranchTarget(Label);
+      if (NodeTarget)
+        return LowerInt1BranchTarget(NodeTarget);
+      return LowerInt1BranchTarget(LabelTarget);
    }

-    if (CondIfTrue0 == CondARM32::kNone) {
-      assert(CondIfFalse == CondARM32::AL);
-      assert(CondIfTrue1 == CondARM32::kNone);
-    }
+    CfgNode *const NodeTarget = nullptr;
+    InstARM32Label *const LabelTarget = nullptr;
+  };

-    if (CondIfTrue1 != CondARM32::kNone) {
-      assert(CondIfFalse == CondARM32::AL);
-      assert(CondIfTrue1 != CondARM32::kNone);
-    }
+  /// LowerInt1AllowShortCircuit is a helper type used by lowerInt1ForBranch for
+  /// determining which type of arithmetic is allowed to be short circuited.
+  /// This is useful for lowering
+  ///
+  ///   t1 = and i1 A, B
+  ///   t2 = and i1 t1, C
+  ///   br i1 t2, label %True, label %False
+  ///
+  /// to
+  ///
+  ///   tst A, 1
+  ///   beq %False
+  ///   tst B, 1
+  ///   beq %False
+  ///   tst C, 1
+  ///   bne %True
+  ///   b %False
+  ///
+  /// Without this information, short circuiting would only allow to short
+  /// circuit a single high level instruction. For example:
+  ///
+  ///   t1 = or i1 A, B
+  ///   t2 = and i1 t1, C
+  ///   br i1 t2, label %True, label %False
+  ///
+  /// cannot be lowered to
+  ///
+  ///   tst A, 1
+  ///   bne %True
+  ///   tst B, 1
+  ///   bne %True
+  ///   tst C, 1
+  ///   beq %True
+  ///   b %False
+  ///
+  /// It needs to be lowered to
+  ///
+  ///   tst A, 1
+  ///   bne Aux
+  ///   tst B, 1
+  ///   beq %False
+  /// Aux:
+  ///   tst C, 1
+  ///   bne %True
+  ///   b %False
+  ///
+  /// TODO(jpp): evaluate if this kind of short circuiting hurts performance (it
+  /// might.)
+  enum LowerInt1AllowShortCircuit {
+    SC_And = 1,
+    SC_Or = 2,
+    SC_All = SC_And | SC_Or,
+  };

-    bool RedefineT = false;
-    if (CondIfFalse != CondARM32::kNone) {
-      _mov(Dest, SrcIfFalse, CondIfFalse);
-      RedefineT = true;
+  /// ShortCircuitCondAndLabel wraps the condition codes that should be used
+  /// after a lowerInt1ForBranch returns to branch to the
+  /// TrueTarget/FalseTarget. If ShortCircuitLabel is not nullptr, then the
+  /// called lowerInt1ForBranch created an internal (i.e., short-circuit) label
+  /// used for short circuiting.
+  class ShortCircuitCondAndLabel {
+  public:
+    explicit ShortCircuitCondAndLabel(CondWhenTrue &&C,
+                                      InstARM32Label *L = nullptr)
+        : Cond(std::move(C)), ShortCircuitTarget(L) {}
+    const CondWhenTrue Cond;
+    InstARM32Label *const ShortCircuitTarget;
+
+    CondWhenTrue assertNoLabelAndReturnCond() const {
+      assert(ShortCircuitTarget == nullptr);
+      return Cond;
    }
+  };

-    if (CondIfTrue0 != CondARM32::kNone) {
-      if (RedefineT) {
-        _mov_redefined(Dest, SrcIfTrue, CondIfTrue0);
-      } else {
-        _mov(Dest, SrcIfTrue, CondIfTrue0);
-      }
-      RedefineT = true;
+  /// lowerInt1ForBranch expands Boolean, and returns the condition codes that
+  /// are to be used for branching to the branch's TrueTarget. It may return a
+  /// label that the expansion of Boolean used to short circuit the chain's
+  /// evaluation.
+  ShortCircuitCondAndLabel
+  lowerInt1ForBranch(Operand *Boolean, const LowerInt1BranchTarget &TargetTrue,
+                     const LowerInt1BranchTarget &TargetFalse,
+                     uint32_t ShortCircuitable);
+
+  // _br is a convenience wrapper that emits br instructions to BrTarget.
+  void _br(const LowerInt1BranchTarget &BrTarget,
+           CondARM32::Cond Cond = CondARM32::AL) {
+    assert((BrTarget.NodeTarget == nullptr) !=
+           (BrTarget.LabelTarget == nullptr));
+    if (BrTarget.NodeTarget != nullptr)
+      _br(BrTarget.NodeTarget, Cond);
+    else
+      _br(BrTarget.LabelTarget, Cond);
+  }
+
+  // _br_short_circuit is used when lowering InstArithmetic::And and
+  // InstArithmetic::Or and a short circuit branch is needed.
+  void _br_short_circuit(const LowerInt1BranchTarget &Target,
+                         const CondWhenTrue &Cond) {
+    if (Cond.WhenTrue1 != CondARM32::kNone) {
+      _br(Target, Cond.WhenTrue1);
    }
-
-    if (CondIfTrue1 != CondARM32::kNone) {
-      assert(RedefineT);
-      _mov_redefined(Dest, SrcIfTrue, CondIfTrue1);
+    if (Cond.WhenTrue0 != CondARM32::kNone) {
+      _br(Target, Cond.WhenTrue0);
    }
  }
+  // End of bool folding machinery
+  // --------------------------------------------------------------------------

  /// The Operand can only be a 16-bit immediate or a ConstantRelocatable (with
  /// an upper16 relocation).
@@ -628,9 +835,6 @@ private:
  OperandARM32Mem *formAddressingMode(Type Ty, Cfg *Func, const Inst *LdSt,
                                      Operand *Base);

-  void lowerTruncToFlags(Operand *Src, CondARM32::Cond *CondIfTrue,
-                         CondARM32::Cond *CondIfFalse);
-
  class BoolComputationTracker {
  public:
    BoolComputationTracker() = default;
@@ -658,7 +862,7 @@ private:
        return;
      OstreamLocker L(Func->getContext());
      Ostream &Str = Func->getContext()->getStrDump();
-      Str << "foldable producer:\n  ";
+      Str << "foldable producer:\n";
      for (const auto &Computation : KnownComputations) {
        Str << "    ";
        Computation.second.Instr->dump(Func);
@@ -679,6 +883,7 @@ private:
      // Om1 mode) IsLiveOut will never be set to false, and folding will be
      // disabled.
      bool IsLiveOut = true;
+      int32_t NumUses = 0;
    };

    using BoolComputationMap = std::unordered_map<SizeT, BoolComputationEntry>;

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -82,7 +82,7 @@ public:
    PK_Icmp64,
    PK_Fcmp,
    PK_Trunc,
-    PK_Arith  // A flag-setting arithmetic instruction.
+    PK_Arith // A flag-setting arithmetic instruction.
  };

  /// Currently the actual enum values are not used (other than CK_None), but we

--- a/tests_lit/assembler/arm32/branch-mult-fwd.ll
+++ b/tests_lit/assembler/arm32/branch-mult-fwd.ll
@@ -52,40 +52,40 @@ define internal void @mult_fwd_branches(i32 %a, i32 %b) {

  %cmp = icmp slt i32 %a, %b

-; ASM-NEXT:     ldr     r0, [sp, #8]
-; ASM-NEXT:     ldr     r1, [sp, #4]
-; ASM-NEXT:     cmp     r0, r1
-; ASM-NEXT:     movge   r0, #0
+; ASM-NEXT:     mov     r0, #0
+; ASM-NEXT:     ldr     r1, [sp, #8]
+; ASM-NEXT:     ldr     r2, [sp, #4]
+; ASM-NEXT:     cmp     r1, r2
 ; ASM-NEXT:     movlt   r0, #1
 ; ASM-NEXT:     strb    r0, [sp]

-; DIS-NEXT:   c:        e59d0008
-; DIS-NEXT:  10:        e59d1004
-; DIS-NEXT:  14:        e1500001
-; DIS-NEXT:  18:        a3a00000
+; DIS-NEXT:   c:        e3a00000
+; DIS-NEXT:  10:        e59d1008
+; DIS-NEXT:  14:        e59d2004
+; DIS-NEXT:  18:        e1510002
 ; DIS-NEXT:  1c:        b3a00001
 ; DIS-NEXT:  20:        e5cd0000

-; IASM-NEXT:    .byte 0x8
 ; IASM-NEXT:    .byte 0x0
+; IASM-NEXT:    .byte 0x0
+; IASM-NEXT:    .byte 0xa0
+; IASM-NEXT:    .byte 0xe3
+
+; IASM-NEXT:    .byte 0x8
+; IASM-NEXT:    .byte 0x10
 ; IASM-NEXT:    .byte 0x9d
 ; IASM-NEXT:    .byte 0xe5

 ; IASM-NEXT:    .byte 0x4
-; IASM-NEXT:    .byte 0x10
+; IASM-NEXT:    .byte 0x20
 ; IASM-NEXT:    .byte 0x9d
 ; IASM-NEXT:    .byte 0xe5

-; IASM-NEXT:    .byte 0x1
+; IASM-NEXT:    .byte 0x2
 ; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x50
+; IASM-NEXT:    .byte 0x51
 ; IASM-NEXT:    .byte 0xe1

-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xa0
-; IASM-NEXT:    .byte 0xa3
-
 ; IASM-NEXT:    .byte 0x1
 ; IASM-NEXT:    .byte 0x0
 ; IASM-NEXT:    .byte 0xa0
@@ -96,23 +96,20 @@ define internal void @mult_fwd_branches(i32 %a, i32 %b) {
  br i1 %cmp, label %then, label %else

 ; ASM-NEXT:     ldrb    r0, [sp]
-; ASM-NEXT:     uxtb    r0, r0
-; ASM-NEXT:     cmp     r0, #0
+; ASM-NEXT:     tst     r0, #1
 ; ASM-NEXT:     bne     .Lmult_fwd_branches$then
 ; ASM-NEXT:     b       .Lmult_fwd_branches$else

 ; DIS-NEXT:  24:        e5dd0000
-; DIS-NEXT:  28:        e6ef0070
-; DIS-NEXT:  2c:        e3500000
-; DIS-NEXT:  30:        1a000000
-; DIS-NEXT:  34:        ea000000
+; DIS-NEXT:  28:        e3100001
+; DIS-NEXT:  2c:        1a000000
+; DIS-NEXT:  30:        ea000000

 ; IASM-NEXT:    ldrb    r0, [sp]
-; IASM-NEXT:    uxtb    r0, r0

+; IASM-NEXT:    .byte 0x1
 ; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x50
+; IASM-NEXT:    .byte 0x10
 ; IASM-NEXT:    .byte 0xe3

 ; IASM-NEXT:    .byte 0x0
@@ -132,7 +129,7 @@ then:
  br label %end
 ; ASM-NEXT:     b       .Lmult_fwd_branches$end

-; DIS-NEXT:  38:        ea000000
+; DIS-NEXT:  34:        ea000000

 ; IASM-NEXT:    .byte 0x0
 ; IASM-NEXT:    .byte 0x0
@@ -146,7 +143,7 @@ else:
  br label %end
 ; ASM-NEXT:     b       .Lmult_fwd_branches$end

-; DIS-NEXT:  3c:        eaffffff
+; DIS-NEXT:  38:        eaffffff

 ; IASM-NEXT:    .byte 0xff
 ; IASM-NEXT:    .byte 0xff
@@ -163,8 +160,8 @@ end:
 ; ASM-NEXT:     add     sp, sp, #12
 ; ASM-NEXT:     bx      lr

-; DIS-NEXT:  40:        e28dd00c
-; DIS-NEXT:  44:        e12fff1e
+; DIS-NEXT:  3c:        e28dd00c
+; DIS-NEXT:  40:        e12fff1e

 ; IASM-NEXT:    .byte 0xc
 ; IASM-NEXT:    .byte 0xd0

--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -850,10 +850,7 @@ entry:

 ; ARM32-LABEL: trunc64To1
 ; ARM32-OM1: and r0, r0, #1
-; ARM32-OM1: and r0, r0, #1
-; ARM32-O2: tst r0, #1
-; ARM32-O2: moveq [[RES:r[0-9]+]], #0
-; ARM32-O2: movne [[RES]], #1
+; ARM32-O2: and r0, r0, #1

 define internal i64 @sext32To64(i32 %a) {
 entry:
@@ -924,12 +921,10 @@ entry:
 ; OPTM1: sar {{.*}},0x1f

 ; ARM32-LABEL: sext1To64
-; ARM32-OM1: lsl {{.*}}, #31
-; ARM32-OM1: asr {{.*}}, #31
-; ARM32-O2: tst r0, #1
-; ARM32-O2: mvn [[M1:r[0-9]+]], #0
-; ARM32-O2: moveq [[RES:r[0-9]+]], #0
-; ARM32-O2: movne [[RES]], [[M1]]
+; ARM32: mov {{.*}}, #0
+; ARM32: tst {{.*}}, #1
+; ARM32: mvn {{.*}}, #0
+; ARM32: movne

 define internal i64 @zext32To64(i32 %a) {
 entry:
@@ -998,11 +993,9 @@ entry:
 ; OPTM1: mov {{.*}},0x0

 ; ARM32-LABEL: zext1To64
-; ARM32-OM1: and {{.*}}, #1
-; ARM32-OM1: mov {{.*}}, #0
-; ARM32-O2: tst r0, #1
-; ARM32-O2: moveq {{[^,]*}}, #0
-; ARM32-O2: movne {{[^,]*}}, #1
+; ARM32: and {{.*}}, #1
+; ARM32: mov {{.*}}, #0
+; ARM32: bx

 define internal void @icmpEq64(i64 %a, i64 %b, i64 %c, i64 %d) {
 entry:
@@ -1061,18 +1054,15 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; ARM32-LABEL: icmpEq64
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movne
-; ARM32-OM1: moveq
-; ARM32-OM1: cmp
-; ARM32-O2: bne
-; ARM32: bl
+; ARM32-OM1: tst
+; ARM32: bne
+; ARM32: bl {{.*}} <func>
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movne
-; ARM32-OM1: moveq
-; ARM32-OM1: cmp
-; ARM32-O2: bne
-; ARM32: bl
+; ARM32-OM1: tst
+; ARM32: bne
+; ARM32: bl {{.*}} <func>
+; ARM32: bx

 declare void @func()

@@ -1133,16 +1123,14 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; ARM32-LABEL: icmpNe64
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: moveq
-; ARM32-OM1: movne
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: beq
-; ARM32: bl
+; ARM32: bl {{.*}} <func>
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: moveq
-; ARM32-OM1: movne
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: beq
 ; ARM32: bl

@@ -1189,16 +1177,14 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; ARM32-LABEL: icmpGt64
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movls
-; ARM32-OM1: movhi
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: bls
 ; ARM32: bl
 ; ARM32: cmp
 ; ARM32: sbcs
-; ARM32-OM1: movge
-; ARM32-OM1: movlt
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: bge
 ; ARM32: bl

@@ -1245,16 +1231,14 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; ARM32-LABEL: icmpGe64
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movcc
-; ARM32-OM1: movcs
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: bcc
 ; ARM32: bl
 ; ARM32: cmp
 ; ARM32: sbcs
-; ARM32-OM1: movlt
-; ARM32-OM1: movge
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: blt
 ; ARM32: bl

@@ -1301,16 +1285,14 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; ARM32-LABEL: icmpLt64
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movcs
-; ARM32-OM1: movcc
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: bcs
 ; ARM32: bl
 ; ARM32: cmp
 ; ARM32: sbcs
-; ARM32-OM1: movge
-; ARM32-OM1: movlt
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: bge
 ; ARM32: bl

@@ -1357,15 +1339,14 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; ARM32-LABEL: icmpLe64
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movhi
-; ARM32-OM1: movls
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: bhi
 ; ARM32: bl
 ; ARM32: cmp
 ; ARM32: sbcs
-; ARM32-OM1: movlt
-; ARM32-OM1: movge
+; ARM32-OM1: tst
+; ARM32-OM1: bne
 ; ARM32-O2: blt
 ; ARM32: bl

@@ -1384,7 +1365,7 @@ entry:
 ; OPTM1: je

 ; ARM32-LABEL: icmpEq64Bool
-; ARM32: movne
+; ARM32: mov
 ; ARM32: moveq

 define internal i32 @icmpNe64Bool(i64 %a, i64 %b) {
@@ -1402,7 +1383,7 @@ entry:
 ; OPTM1: jne

 ; ARM32-LABEL: icmpNe64Bool
-; ARM32: moveq
+; ARM32: mov
 ; ARM32: movne

 define internal i32 @icmpSgt64Bool(i64 %a, i64 %b) {
@@ -1426,9 +1407,9 @@ entry:
 ; OPTM1: ja

 ; ARM32-LABEL: icmpSgt64Bool
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: sbcs
-; ARM32: movge
 ; ARM32: movlt

 define internal i32 @icmpUgt64Bool(i64 %a, i64 %b) {
@@ -1452,9 +1433,9 @@ entry:
 ; OPTM1: ja

 ; ARM32-LABEL: icmpUgt64Bool
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32: movls
 ; ARM32: movhi

 define internal i32 @icmpSge64Bool(i64 %a, i64 %b) {
@@ -1478,9 +1459,9 @@ entry:
 ; OPTM1: jae

 ; ARM32-LABEL: icmpSge64Bool
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: sbcs
-; ARM32: movlt
 ; ARM32: movge

 define internal i32 @icmpUge64Bool(i64 %a, i64 %b) {
@@ -1504,9 +1485,9 @@ entry:
 ; OPTM1: jae

 ; ARM32-LABEL: icmpUge64Bool
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32: movcc
 ; ARM32: movcs

 define internal i32 @icmpSlt64Bool(i64 %a, i64 %b) {
@@ -1530,9 +1511,9 @@ entry:
 ; OPTM1: jb

 ; ARM32-LABEL: icmpSlt64Bool
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: sbcs
-; ARM32: movge
 ; ARM32: movlt

 define internal i32 @icmpUlt64Bool(i64 %a, i64 %b) {
@@ -1556,9 +1537,9 @@ entry:
 ; OPTM1: jb

 ; ARM32-LABEL: icmpUlt64Bool
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32: movcs
 ; ARM32: movcc

 define internal i32 @icmpSle64Bool(i64 %a, i64 %b) {
@@ -1582,9 +1563,9 @@ entry:
 ; OPTM1: jbe

 ; ARM32-LABEL: icmpSle64Bool
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: sbcs
-; ARM32: movlt
 ; ARM32: movge

 define internal i32 @icmpUle64Bool(i64 %a, i64 %b) {
@@ -1608,9 +1589,9 @@ entry:
 ; OPTM1: jbe

 ; ARM32-LABEL: icmpUle64Bool
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32: movhi
 ; ARM32: movls

 define internal i64 @load64(i32 %a) {
@@ -1701,9 +1682,7 @@ entry:
 ; ARM32-LABEL: select64VarVar
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movcs
-; ARM32-OM1: movcc
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
 ; ARM32-OM1: movne
 ; ARM32-O2: movcc
 ; ARM32-OM1: movne
@@ -1734,19 +1713,17 @@ entry:
 ; OPTM1: cmovne

 ; ARM32-LABEL: select64VarConst
+; ARM32: mov
+; ARM32: mov
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movcs
-; ARM32-OM1: movcc
-; ARM32-OM1: cmp
-; ARM32: movw
-; ARM32: movt
+; ARM32-OM1: tst
 ; ARM32-OM1: movne
 ; ARM32-O2: movcc
-; ARM32: movw
-; ARM32: movt
 ; ARM32-OM1: movne
 ; ARM32-O2: movcc
+; ARM32-O2: mov
+; ARM32-O2: mov

 define internal i64 @select64ConstVar(i64 %a, i64 %b) {
 entry:
@@ -1775,9 +1752,7 @@ entry:
 ; ARM32-LABEL: select64ConstVar
 ; ARM32: cmp
 ; ARM32: cmpeq
-; ARM32-OM1: movcs
-; ARM32-OM1: movcc
-; ARM32-OM1: cmp
+; ARM32-OM1: tst
 ; ARM32: movw
 ; ARM32: movt
 ; ARM32-OM1: movne

--- a/tests_lit/llvm2ice_tests/bool-folding.ll
+++ b/tests_lit/llvm2ice_tests/bool-folding.ll
@@ -85,7 +85,7 @@ branch2:
 ; ARM32-LABEL: no_fold_cmp_br_liveout
 ; ARM32: cmp
 ; ARM32: movlt [[REG:r[0-9]+]]
-; ARM32: cmp [[REG]], #0
+; ARM32: tst [[REG]], #1
 ; ARM32: beq


@@ -108,11 +108,11 @@ branch2:
 ; CHECK: cmp
 ; CHECK: je
 ; ARM32-LABEL: no_fold_cmp_br_non_whitelist
+; ARM32: mov [[R:r[0-9]+]], #0
 ; ARM32: cmp r0, r1
-; ARM32: movge [[R:r[0-9]+]], #0
 ; ARM32: movlt [[R]], #1
-; ARM32: cmp r0, r1
-; ARM32: bge
+; ARM32: tst [[R]], #1
+; ARM32: beq
 ; ARM32: bx lr
 ; ARM32: mov r0, #2
 ; ARM32: bx lr
@@ -168,10 +168,10 @@ entry:
 ; CHECK: cmovl
 ; CHECK: cmovl
 ; ARM32-LABEL: fold_cmp_select_64_undef
+; ARM32: mov
+; ARM32: mov
 ; ARM32: cmp {{r[0-9]+}}, r0
-; ARM32: movge
 ; ARM32: movlt
-; ARM32: movge
 ; ARM32: movlt
 ; ARM32: bx lr

@@ -218,14 +218,17 @@ entry:
 ; CHECK: add
 ; CHECK: add
 ; ARM32-LABEL: fold_cmp_select_multi
-; ARM32: cmp r0, r1
-; ARM32: movlt {{r[0-9]+}}, r0
-; ARM32: cmp r0, r1
-; ARM32: movlt {{r[0-9]+}}, r1
-; ARM32: cmp r0, r1
-; ARM32: movlt {{r[0-9]+}}, #123
-; ARM32: add
-; ARM32: add
+; ARM32: mov
+; ARM32: cmp
+; ARM32: movlt {{.*}}, #1
+; ARM32: mov
+; ARM32: tst {{.*}}, #1
+; ARM32: movne
+; ARM32: mov
+; ARM32: tst {{.*}}, #1
+; ARM32: movne
+; ARM32: tst {{.*}}, #1
+; ARM32: movne {{.*}}, #123
 ; ARM32: bx lr


@@ -254,22 +257,17 @@ next:
 ; CHECK: add
 ; CHECK: add
 ; ARM32-LABEL: no_fold_cmp_select_multi_liveout
-; ARM32-LABEL: fold_cmp_select_multi
+; ARM32: mov
 ; ARM32: cmp r0, r1
-; ARM32: movge [[T0:r[0-9]+]], #0
-; ARM32: movlt [[T0]], #1
-; ARM32: uxtb [[T1:r[0-9]+]], [[T1]]
-; ARM32-NEXT: cmp [[T1]], #0
-; ARM32: movne [[T2:r[0-9]+]], r0
-; ARM32: uxtb [[T3:r[0-9]+]], [[T3]]
-; ARM32-NEXT: cmp [[T3]], #0
-; ARM32: movne [[T4:r[0-9]+]], r1
-; ARM32-LABEL: .Lno_fold_cmp_select_multi_liveout$next:
-; ARM32: uxtb [[T5:r[0-9]+]], [[T5]]
-; ARM32: cmp [[T5]], #0
-; ARM32: movne [[T6:r[0-9]+]], #123
-; ARM32: add
-; ARM32: add
+; ARM32: movlt
+; ARM32: mov
+; ARM32: tst
+; ARM32: movne
+; ARM32: mov
+; ARM32: tst
+; ARM32: movne
+; ARM32: tst
+; ARM32: movne
 ; ARM32: bx lr

 ; Cmp/multi-select non-folding because of extra non-whitelisted uses.
@@ -300,19 +298,133 @@ entry:
 ; CHECK: add
 ; CHECK: add
 ; ARM32-LABEL: no_fold_cmp_select_multi_non_whitelist
+; ARM32: mov
 ; ARM32: cmp r0, r1
-; ARM32: movge [[R0:r[0-9]+]]
-; ARM32: movlt [[R0]]
-; ARM32: cmp r0, r1
-; ARM32: movge [[R1:r[0-9]+]]
-; ARM32: movlt [[R1]]
-; ARM32: cmp r0, r1
-; ARM32: movge [[R2:r[0-9]+]]
-; ARM32: movlt [[R2]]
-; ARM32: cmp r0, r1
-; ARM32: movge [[R3:r[0-9]+]]
-; ARM32: movlt [[R3]]
-; ARM32: add
-; ARM32: add
-; ARM32: add
+; ARM32: movlt
+; ARM32: mov
+; ARM32: tst
+; ARM32: movne
+; ARM32: mov
+; ARM32: tst
+; ARM32: movne
+; ARM32: tst
+; ARM32: movne
 ; ARM32: bx lr
+
+define internal i32 @br_i1_folding2_and(i32 %arg1, i32 %arg2) {
+  %t0 = trunc i32 %arg1 to i1
+  %t1 = trunc i32 %arg2 to i1
+
+  %t2 = and i1 %t0, %t1
+  br i1 %t2, label %target_true, label %target_false
+
+target_true:
+  ret i32 1
+
+target_false:
+  ret i32 0
+}
+; ARM32-LABEL: br_i1_folding2_and
+; ARM32: tst r0, #1
+; ARM32: beq {{.*}}target_false
+; ARM32: tst r1, #1
+; ARM32: beq {{.*}}target_false
+
+define internal i32 @br_i1_folding2_or(i32 %arg1, i32 %arg2) {
+  %t0 = trunc i32 %arg1 to i1
+  %t1 = trunc i32 %arg2 to i1
+
+  %t2 = or i1 %t0, %t1
+  br i1 %t2, label %target_true, label %target_false
+
+target_true:
+  ret i32 1
+
+target_false:
+  ret i32 0
+}
+; ARM32-LABEL: br_i1_folding2_or
+; ARM32: tst r0, #1
+; ARM32: bne {{.*}}target_true
+; ARM32: tst r1, #1
+; ARM32: beq {{.*}}target_false
+
+define internal i32 @br_i1_folding3_and_or(i32 %arg1, i32 %arg2, i32 %arg3) {
+  %t0 = trunc i32 %arg1 to i1
+  %t1 = trunc i32 %arg2 to i1
+  %t2 = trunc i32 %arg3 to i1
+
+  %t3 = and i1 %t0, %t1
+  %t4 = or i1 %t3, %t2
+
+  br i1 %t4, label %target_true, label %target_false
+
+target_true:
+  ret i32 1
+
+target_false:
+  ret i32 0
+}
+; ARM32-LABEL: br_i1_folding3_and_or
+; ARM32: tst r0, #1
+; ARM32: beq
+; ARM32: tst r1, #1
+; ARM32: bne {{.*}}target_true
+; ARM32: tst r2, #1
+; ARM32: beq {{.*}}target_false
+
+define internal i32 @br_i1_folding3_or_and(i32 %arg1, i32 %arg2, i32 %arg3) {
+  %t0 = trunc i32 %arg1 to i1
+  %t1 = trunc i32 %arg2 to i1
+  %t2 = trunc i32 %arg3 to i1
+
+  %t3 = or i1 %t0, %t1
+  %t4 = and i1 %t3, %t2
+
+  br i1 %t4, label %target_true, label %target_false
+
+target_true:
+  ret i32 1
+
+target_false:
+  ret i32 0
+}
+; ARM32-LABEL: br_i1_folding3_or_and
+; ARM32: tst r0, #1
+; ARM32: bne
+; ARM32: tst r1, #1
+; ARM32: beq {{.*}}target_false
+; ARM32: tst r2, #1
+; ARM32: beq {{.*}}target_false
+
+define internal i32 @br_i1_folding4(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4,
+                                    i32 %arg5) {
+  %t0 = trunc i32 %arg1 to i1
+  %t1 = trunc i32 %arg2 to i1
+  %t2 = trunc i32 %arg3 to i1
+  %t3 = trunc i32 %arg4 to i1
+  %t4 = trunc i32 %arg5 to i1
+
+  %t5 = or i1 %t0, %t1
+  %t6 = and i1 %t5, %t2
+  %t7 = and i1 %t3, %t4
+  %t8 = or i1 %t6, %t7
+  br i1 %t8, label %target_true, label %target_false
+
+target_true:
+  ret i32 1
+
+target_false:
+  ret i32 0
+}
+; ARM32-LABEL: br_i1_folding4
+; ARM32: tst r0, #1
+; ARM32: bne
+; ARM32: tst r1, #1
+; ARM32: beq
+; ARM32: tst r2, #1
+; ARM32: bne     {{.*}}target_true
+; ARM32: tst     r3, #1
+; ARM32: beq     {{.*}}target_false
+; ARM32: tst     r4, #1
+; ARM32: beq     {{.*}}target_false
--- a/tests_lit/llvm2ice_tests/branch-opt.ll
+++ b/tests_lit/llvm2ice_tests/branch-opt.ll
@@ -92,7 +92,6 @@ target:
 ; OM1: call
 ; OM1: ret

-; Note that compare and branch folding isn't implemented yet (unlike x86-32).
 ; ARM32O2-LABEL: testCondFallthroughToNextBlock
 ; ARM32O2: cmp {{.*}}, #123
 ; ARM32O2-NEXT: bge
@@ -102,10 +101,10 @@ target:
 ; ARM32O2: bx lr

 ; ARM32OM1-LABEL: testCondFallthroughToNextBlock
+; ARM32OM1: mov {{.*}}, #0
 ; ARM32OM1: cmp {{.*}}, #123
-; ARM32OM1: movlt {{.*}}, #0
 ; ARM32OM1: movge {{.*}}, #1
-; ARM32OM1: cmp {{.*}}, #0
+; ARM32OM1: tst {{.*}}, #1
 ; ARM32OM1: bne
 ; ARM32OM1: b
 ; ARM32OM1: bl
@@ -161,7 +160,7 @@ target:
 ; ARM32OM1-LABEL: testCondTargetNextBlock
 ; ARM32OM1: cmp {{.*}}, #123
 ; ARM32OM1: movge {{.*}}, #1
-; ARM32OM1: cmp {{.*}}, #0
+; ARM32OM1: tst {{.*}}, #1
 ; ARM32OM1: bne
 ; ARM32OM1: b
 ; ARM32OM1: bl

--- a/tests_lit/llvm2ice_tests/fp.cmp.ll
+++ b/tests_lit/llvm2ice_tests/fp.cmp.ll
@@ -58,13 +58,13 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; ARM32-LABEL: fcmpEq
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32-OM1: movne [[R0:r[0-9]+]], #0
+; ARM32-OM1: mov [[R0:r[0-9]+]], #0
 ; ARM32-OM1: moveq [[R0]], #1
 ; ARM32-O2: bne
 ; ARM32: bl func
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32-OM1: movne [[R1:r[0-9]+]], #0
+; ARM32-OM1: mov [[R1:r[0-9]+]], #0
 ; ARM32-OM1: moveq [[R1]], #1
 ; ARM32-O2: bne

@@ -115,12 +115,12 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; ARM32-LABEL: fcmpNe
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32-OM1: moveq [[R0:r[0-9]+]], #0
+; ARM32-OM1: mov [[R0:r[0-9]+]], #0
 ; ARM32-OM1: movne [[R0]], #1
 ; ARM32-O2: beq
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32-OM1: moveq [[R1:r[0-9]+]], #0
+; ARM32-OM1: mov [[R1:r[0-9]+]], #0
 ; ARM32-OM1: movne [[R1]], #1
 ; ARM32-O2: beq

@@ -161,12 +161,12 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; ARM32-LABEL: fcmpGt
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32-OM1: movle [[R0:r[0-9]+]], #0
+; ARM32-OM1: mov [[R0:r[0-9]+]], #0
 ; ARM32-OM1: movgt [[R0]], #1
 ; ARM32-O2: ble
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32-OM1: movle [[R1:r[0-9]+]], #0
+; ARM32-OM1: mov [[R1:r[0-9]+]], #0
 ; ARM32-OM1: movgt [[R1]], #1
 ; ARM32-O2: ble

@@ -207,12 +207,12 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; ARM32-LABEL: fcmpGe
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32-OM1: movge [[R0:r[0-9]+]], #0
+; ARM32-OM1: mov [[R0:r[0-9]+]], #0
 ; ARM32-OM1: movlt [[R0]], #1
 ; ARM32-O2: blt
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32-OM1: movge [[R1:r[0-9]+]], #0
+; ARM32-OM1: mov [[R1:r[0-9]+]], #0
 ; ARM32-OM1: movlt [[R1]], #1
 ; ARM32-O2: blt

@@ -253,12 +253,12 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; ARM32-LABEL: fcmpLt
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32-OM1: movpl [[R0:r[0-9]+]], #0
+; ARM32-OM1: mov [[R0:r[0-9]+]], #0
 ; ARM32-OM1: movmi [[R0]], #1
 ; ARM32-O2: bpl
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32-OM1: movpl [[R1:r[0-9]+]], #0
+; ARM32-OM1: mov [[R1:r[0-9]+]], #0
 ; ARM32-OM1: movmi [[R1]], #1
 ; ARM32-O2: bpl

@@ -299,12 +299,12 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; ARM32-LABEL: fcmpLe
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32-OM1: movls [[R0:r[0-9]+]], #0
+; ARM32-OM1: mov [[R0:r[0-9]+]], #0
 ; ARM32-OM1: movhi [[R0]], #1
 ; ARM32-O2: bhi
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32-OM1: movls [[R1:r[0-9]+]], #0
+; ARM32-OM1: mov [[R1:r[0-9]+]], #0
 ; ARM32-OM1: movhi [[R1]], #1
 ; ARM32-O2: bhi

@@ -341,9 +341,10 @@ entry:
 ; CHECK: jne
 ; CHECK: jp
 ; ARM32-LABEL: fcmpOeqFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movne [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: moveq [[R]], #1

 define internal i32 @fcmpOeqDouble(double %a, double %b) {
@@ -357,9 +358,10 @@ entry:
 ; CHECK: jne
 ; CHECK: jp
 ; ARM32-LABEL: fcmpOeqDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movne [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: moveq [[R]], #1

 define internal i32 @fcmpOgtFloat(float %a, float %b) {
@@ -372,9 +374,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: seta
 ; ARM32-LABEL: fcmpOgtFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movle [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movgt [[R]], #1

 define internal i32 @fcmpOgtDouble(double %a, double %b) {
@@ -387,9 +390,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: seta
 ; ARM32-LABEL: fcmpOgtDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movle [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movgt [[R]], #1

 define internal i32 @fcmpOgeFloat(float %a, float %b) {
@@ -402,9 +406,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setae
 ; ARM32-LABEL: fcmpOgeFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movlt [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movge [[R]], #1

 define internal i32 @fcmpOgeDouble(double %a, double %b) {
@@ -417,9 +422,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setae
 ; ARM32-LABEL: fcmpOgeDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movlt [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movge [[R]], #1

 define internal i32 @fcmpOltFloat(float %a, float %b) {
@@ -432,9 +438,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: seta
 ; ARM32-LABEL: fcmpOltFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movpl [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movmi [[R]], #1

 define internal i32 @fcmpOltDouble(double %a, double %b) {
@@ -447,9 +454,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: seta
 ; ARM32-LABEL: fcmpOltDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movpl [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movmi [[R]], #1

 define internal i32 @fcmpOleFloat(float %a, float %b) {
@@ -462,9 +470,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setae
 ; ARM32-LABEL: fcmpOleFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movhi [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movls [[R]], #1

 define internal i32 @fcmpOleDouble(double %a, double %b) {
@@ -477,9 +486,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setae
 ; ARM32-LABEL: fcmpOleDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movhi [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movls [[R]], #1

 define internal i32 @fcmpOneFloat(float %a, float %b) {
@@ -492,9 +502,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setne
 ; ARM32-LABEL: fcmpOneFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movmi [[R]], #1
 ; ARM32: movgt [[R]], #1

@@ -508,9 +519,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setne
 ; ARM32-LABEL: fcmpOneDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movmi [[R]], #1
 ; ARM32: movgt [[R]], #1

@@ -524,9 +536,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setnp
 ; ARM32-LABEL: fcmpOrdFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movvs [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movvc [[R]], #1

 define internal i32 @fcmpOrdDouble(double %a, double %b) {
@@ -539,9 +552,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setnp
 ; ARM32-LABEL: fcmpOrdDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movvs [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movvc [[R]], #1

 define internal i32 @fcmpUeqFloat(float %a, float %b) {
@@ -554,9 +568,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: sete
 ; ARM32-LABEL: fcmpUeqFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: moveq [[R]], #1
 ; ARM32: movvs [[R]], #1

@@ -570,9 +585,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: sete
 ; ARM32-LABEL: fcmpUeqDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: moveq [[R]], #1
 ; ARM32: movvs [[R]], #1

@@ -586,9 +602,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setb
 ; ARM32-LABEL: fcmpUgtFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movls [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movhi [[R]], #1

 define internal i32 @fcmpUgtDouble(double %a, double %b) {
@@ -601,9 +618,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setb
 ; ARM32-LABEL: fcmpUgtDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movls [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movhi [[R]], #1

 define internal i32 @fcmpUgeFloat(float %a, float %b) {
@@ -616,9 +634,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setbe
 ; ARM32-LABEL: fcmpUgeFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movmi [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movpl [[R]], #1

 define internal i32 @fcmpUgeDouble(double %a, double %b) {
@@ -631,9 +650,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setbe
 ; ARM32-LABEL: fcmpUgeDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movmi [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movpl [[R]], #1

 define internal i32 @fcmpUltFloat(float %a, float %b) {
@@ -646,9 +666,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setb
 ; ARM32-LABEL: fcmpUltFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movge [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movlt [[R]], #1

 define internal i32 @fcmpUltDouble(double %a, double %b) {
@@ -661,9 +682,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setb
 ; ARM32-LABEL: fcmpUltDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movge [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movlt [[R]], #1

 define internal i32 @fcmpUleFloat(float %a, float %b) {
@@ -676,9 +698,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setbe
 ; ARM32-LABEL: fcmpUleFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movgt [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movle [[R]], #1

 define internal i32 @fcmpUleDouble(double %a, double %b) {
@@ -691,9 +714,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setbe
 ; ARM32-LABEL: fcmpUleDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movgt [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movle [[R]], #1

 define internal i32 @fcmpUneFloat(float %a, float %b) {
@@ -707,9 +731,10 @@ entry:
 ; CHECK: jne
 ; CHECK: jp
 ; ARM32-LABEL: fcmpUneFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: moveq [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movne [[R]], #1

 define internal i32 @fcmpUneDouble(double %a, double %b) {
@@ -723,9 +748,10 @@ entry:
 ; CHECK: jne
 ; CHECK: jp
 ; ARM32-LABEL: fcmpUneDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: moveq [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movne [[R]], #1

 define internal i32 @fcmpUnoFloat(float %a, float %b) {
@@ -738,9 +764,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: setp
 ; ARM32-LABEL: fcmpUnoFloat
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f32
 ; ARM32: vmrs
-; ARM32: movvc [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movvs [[R]], #1

 define internal i32 @fcmpUnoDouble(double %a, double %b) {
@@ -753,9 +780,10 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: setp
 ; ARM32-LABEL: fcmpUnoDouble
+; ARM32-O2: mov [[R:r[0-9]+]], #0
 ; ARM32: vcmp.f64
 ; ARM32: vmrs
-; ARM32: movvc [[R:r[0-9]+]], #0
+; ARM32-OM1: mov [[R:r[0-9]+]], #0
 ; ARM32: movvs [[R]], #1

 define internal i32 @fcmpTrueFloat(float %a, float %b) {

--- a/tests_lit/llvm2ice_tests/select-opt.ll
+++ b/tests_lit/llvm2ice_tests/select-opt.ll
@@ -51,14 +51,12 @@ declare void @useInt(i32 %x)
 ; CHECK:      ret
 ; ARM32-LABEL: testSelect
 ; ARM32: cmp
-; ARM32-OM1: cmp
 ; ARM32: bl {{.*}} useInt
-; ARM32: cmp
-; ARM32-Om1: cmp
 ; ARM32-Om1: mov {{.*}}, #20
+; ARM32-O2: mov [[REG:r[0-9]+]], #20
+; ARM32: tst
 ; ARM32-Om1: movne {{.*}}, #10
-; ARM32-O2: movle [[REG:r[0-9]+]], #20
-; ARM32-O2: movgt [[REG]], #10
+; ARM32-O2: movne [[REG]], #10
 ; ARM32: bl {{.*}} useInt
 ; ARM32: bl {{.*}} useInt
 ; ARM32: bl {{.*}} useInt

--- a/tests_lit/llvm2ice_tests/test_i1.ll
+++ b/tests_lit/llvm2ice_tests/test_i1.ll
@@ -15,8 +15,6 @@
 ; RUN:   | %if --need=target_ARM32 --need=allow_dump \
 ; RUN:   --command FileCheck --check-prefix ARM32 %s

-; TODO(jvoung): test this.
-
 ; Test that and with true uses immediate 1, not -1.
 define internal i32 @testAndTrue(i32 %arg) {
 entry:
@@ -66,9 +64,7 @@ entry:
 ; CHECK-LABEL: testTrunc
 ; CHECK: and {{.*}},0x1
 ; ARM32-LABEL: testTrunc
-; ARM32: tst r0, #1
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], #1
+; ARM32: and {{.*}}, #1

 ; Test zext to i8.
 define internal i32 @testZextI8(i32 %arg) {
@@ -84,10 +80,8 @@ entry:
 ; match the zext i1 instruction (NOTE: no mov need between i1 and i8).
 ; CHECK-NOT: and {{.*}},0x1
 ; ARM32-LABEL: testZextI8
-; ARM32: tst r0, #1
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], #1
-; ARM32: uxtb [[REG]]
+; ARM32: {{.*}}, #1
+; ARM32: uxtb

 ; Test zext to i16.
 define internal i32 @testZextI16(i32 %arg) {
@@ -105,10 +99,8 @@ entry:
 ; CHECK-NOT: and [[REG]],0x1

 ; ARM32-LABEL: testZextI16
-; ARM32: tst r0, #1
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], #1
-; ARM32: uxth [[REG]]
+; ARM32: and {{.*}}, #1
+; ARM32: uxth

 ; Test zext to i32.
 define internal i32 @testZextI32(i32 %arg) {
@@ -124,9 +116,7 @@ entry:
 ; CHECK: movzx
 ; CHECK-NOT: and {{.*}},0x1
 ; ARM32-LABEL: testZextI32
-; ARM32: tst r0, #1
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], #1
+; ARM32: and {{.*}}, #1

 ; Test zext to i64.
 define internal i64 @testZextI64(i32 %arg) {
@@ -142,10 +132,8 @@ entry:
 ; CHECK: movzx
 ; CHECK: mov {{.*}},0x0
 ; ARM32-LABEL: testZextI64
-; ARM32: tst r0, #1
-; ARM32: mov r{{[0-9]*}}, #0
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], #1
+; ARM32: and {{.*}}, #1
+; ARM32: mov {{.*}}, #0

 ; Test sext to i8.
 define internal i32 @testSextI8(i32 %arg) {
@@ -163,11 +151,11 @@ entry:
 ; CHECK-NEXT: sar [[REG]],0x7
 ;
 ; ARM32-LABEL: testSextI8
-; ARM32: tst r0, #1
-; ARM32: mvn [[REG_M1:r[0-9]*]], #0
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], [[REG_M1]]
-; ARM32: sxtb [[REG]]
+; ARM32: mov {{.*}}, #0
+; ARM32: tst {{.*}}, #1
+; ARM32: mvn {{.*}}, #0
+; ARM32: movne
+; ARM32: sxtb

 ; Test sext to i16.
 define internal i32 @testSextI16(i32 %arg) {
@@ -186,11 +174,11 @@ entry:
 ; CHECK-NEXT: sar [[REG]],0xf

 ; ARM32-LABEL: testSextI16
-; ARM32: tst r0, #1
-; ARM32: mvn [[REG_M1:r[0-9]*]], #0
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], [[REG_M1]]
-; ARM32: sxth [[REG]]
+; ARM32: mov {{.*}}, #0
+; ARM32: tst {{.*}}, #1
+; ARM32: mvn {{.*}}, #0
+; ARM32: movne
+; ARM32: sxth

 ; Test sext to i32.
 define internal i32 @testSextI32(i32 %arg) {
@@ -208,10 +196,10 @@ entry:
 ; CHECK-NEXT: sar [[REG]],0x1f

 ; ARM32-LABEL: testSextI32
-; ARM32: tst r0, #1
-; ARM32: mvn [[REG_M1:r[0-9]*]], #0
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], [[REG_M1]]
+; ARM32: mov {{.*}}, #0
+; ARM32: tst {{.*}}, #1
+; ARM32: mvn {{.*}}, #0
+; ARM32: movne

 ; Test sext to i64.
 define internal i64 @testSextI64(i32 %arg) {
@@ -229,11 +217,11 @@ entry:
 ; CHECK-NEXT: sar [[REG]],0x1f

 ; ARM32-LABEL: testSextI64
-; ARM32: tst r0, #1
-; ARM32: mvn [[REG_M1:r[0-9]*]], #0
-; ARM32: moveq [[REG:r[0-9]*]], #0
-; ARM32: movne [[REG]], [[REG_M1]]
-; ARM32: mov r{{[0-9]+}}, [[REG]]
+; ARM32: mov {{.*}}, #0
+; ARM32: tst {{.*}}, #1
+; ARM32: mvn {{.*}}, #0
+; ARM32: movne [[REG:r[0-9]+]]
+; ARM32: mov {{.*}}, [[REG]]

 ; Kind of like sext i1 to i32, but with an immediate source. On ARM,
 ; sxtb cannot take an immediate operand, so make sure it's using a reg.
@@ -248,9 +236,10 @@ define internal i32 @testSextTrue() {
 ; CHECK-NEXT: shl
 ; CHECK-NEXT: sar
 ; ARM32-LABEL: testSextTrue
-; ARM32: mov{{.*}}, #1
-; ARM32: lsl
-; ARM32: asr
+; ARM32: mov {{.*}}, #0
+; ARM32: tst {{.*}}, #1
+; ARM32: mvn {{.*}}, #0
+; ARM32: movne

 define internal i32 @testZextTrue() {
  %result = zext i1 true to i32