Subzero ARM: do lowerIcmp, lowerBr, and a bit of lowerCall.

Allow instructions to be predicated and use that in lower icmp and branch. Tracking the predicate for almost every instruction is a bit overkill, but technically possible. Add that to most of the instruction constructors except ret and call for now. This doesn't yet do compare + branch fusing, but it does handle the branch fallthrough to avoid branching twice. I can't yet test 8bit and 16bit, since those come from "trunc" and "trunc" is not lowered yet (or load, which also isn't handled yet). Adds basic "call(void)" lowering, just to get the call markers showing up in tests. 64bit.pnacl.ll no longer explodes with liveness consistency errors, so risk running that and backfill some of the 64bit arith tests. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1151663004

Subzero ARM: do lowerIcmp, lowerBr, and a bit of lowerCall.
3bfd99a3 · Jan Voung · e94740a0 · 3bfd99a3 · 3bfd99a3 · 3bfd99a3
Commit 3bfd99a3 authored May 22, 2015 by Jan Voung
12 changed files
--- a/src/IceConditionCodesARM32.h
+++ b/src/IceConditionCodesARM32.h
+//===- subzero/src/IceConditionCodesARM32.h - Condition Codes ---*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the condition codes for ARM32.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICECONDITIONCODESARM32_H
+#define SUBZERO_SRC_ICECONDITIONCODESARM32_H
+
+#include "IceDefs.h"
+#include "IceInstARM32.def"
+
+namespace Ice {
+
+class CondARM32 { // Scope for the ARM32 condition-code enum; not instantiable.
+  CondARM32() = delete;
+  CondARM32(const CondARM32 &) = delete;
+  CondARM32 &operator=(const CondARM32 &) = delete;
+
+public:
+  // An enum of codes used for conditional instructions. The enum value
+  // should match the value used to encode operands in binary instructions.
+  enum Cond { // One enumerator per ICEINSTARM32COND_TABLE row: tag = encode.
+#define X(tag, encode, opp, emit) tag = encode,
+    ICEINSTARM32COND_TABLE // The opp and emit columns are ignored here.
+#undef X
+  };
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICECONDITIONCODESARM32_H
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -91,4 +91,27 @@
  X(RRX, "rrx")                                                         \
 //#define X(tag, emit)

+// Attributes for the condition code 4-bit encoding (that is independent
+// of the APSR's NZCV fields). For example, EQ is 0, but corresponds to
+// Z = 1, and NE is 1, but corresponds to Z = 0.
+#define ICEINSTARM32COND_TABLE                                          \
+  /* enum value, encoding, opposite, emit */                            \
+  X(EQ, 0, NE, "eq") /* equal */                                        \
+  X(NE, 1, EQ, "ne") /* not equal */                                    \
+  X(CS, 2, CC, "cs") /* carry set/unsigned (AKA hs: higher or same) */  \
+  X(CC, 3, CS, "cc") /* carry clear/unsigned (AKA lo: lower) */         \
+  X(MI, 4, PL, "mi") /* minus/negative */                               \
+  X(PL, 5, MI, "pl") /* plus/positive or zero */                        \
+  X(VS, 6, VC, "vs") /* overflow (float unordered) */                   \
+  X(VC, 7, VS, "vc") /* no overflow (float not unordered) */            \
+  X(HI, 8, LS, "hi") /* unsigned higher */                              \
+  X(LS, 9, HI, "ls") /* unsigned lower or same */                       \
+  X(GE, 10, LT, "ge") /* signed greater than or equal */                \
+  X(LT, 11, GE, "lt") /* signed less than */                            \
+  X(GT, 12, LE, "gt") /* signed greater than */                         \
+  X(LE, 13, GT, "le") /* signed less than or equal */                   \
+  X(AL, 14, kNone, "") /* always (unconditional) */                     \
+  X(kNone, 15, kNone, "??") /* special condition / none */              \
+//#define X(tag, encode, opp, emit)
+
 #endif // SUBZERO_SRC_ICEINSTARM32_DEF
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -583,8 +583,8 @@ void InstX8632Call::dump(const Cfg *Func) const {
 // shift instructions, in order to be syntactically valid.  The
 // Opcode parameter needs to be char* and not IceString because of
 // template issues.
-void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
-                    bool ShiftHack) {
+void InstX8632::emitTwoAddress(const char *Opcode, const Inst *Inst,
+                               const Cfg *Func, bool ShiftHack) {
  if (!ALLOW_DUMP)
    return;
  Ostream &Str = Func->getContext()->getStrEmit();
@@ -703,9 +703,9 @@ void emitIASAsAddrOpTyGPR(
  }
 }

-void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
-                     const Operand *Src,
-                     const X8632::AssemblerX8632::GPREmitterShiftOp &Emitter) {
+void InstX8632::emitIASGPRShift(
+    const Cfg *Func, Type Ty, const Variable *Var, const Operand *Src,
+    const X8632::AssemblerX8632::GPREmitterShiftOp &Emitter) {
  X8632::AssemblerX8632 *Asm = Func->getAssembler<X8632::AssemblerX8632>();
  // Technically, the Dest Var can be mem as well, but we only use Reg.
  // We can extend this to check Dest if we decide to use that form.

--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -17,9 +17,9 @@
 #define SUBZERO_SRC_ICEINSTX8632_H

 #include "assembler_ia32.h"
+#include "IceConditionCodesX8632.h"
 #include "IceDefs.h"
 #include "IceInst.h"
-#include "IceConditionCodesX8632.h"
 #include "IceInstX8632.def"
 #include "IceOperand.h"

@@ -268,6 +268,17 @@ public:
  static CondX86::BrCond getOppositeCondition(CondX86::BrCond Cond);
  void dump(const Cfg *Func) const override;

+  // Shared emit routines for common forms of instructions.
+  // See the definition of emitTwoAddress() for a description of
+  // ShiftHack.
+  static void emitTwoAddress(const char *Opcode, const Inst *Inst,
+                             const Cfg *Func, bool ShiftHack = false);
+
+  static void
+  emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
+                  const Operand *Src,
+                  const X8632::AssemblerX8632::GPREmitterShiftOp &Emitter);
+
 protected:
  InstX8632(Cfg *Func, InstKindX8632 Kind, SizeT Maxsrcs, Variable *Dest)
      : InstTarget(Func, static_cast<InstKind>(Kind), Maxsrcs, Dest) {}
@@ -665,15 +676,6 @@ private:
  static const X8632::AssemblerX8632::XmmEmitterRegOp Emitter;
 };

-// See the definition of emitTwoAddress() for a description of
-// ShiftHack.
-void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
-                    bool ShiftHack = false);
-
-void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
-                     const Operand *Src,
-                     const X8632::AssemblerX8632::GPREmitterShiftOp &Emitter);
-
 template <InstX8632::InstKindX8632 K>
 class InstX8632BinopGPRShift : public InstX8632 {
  InstX8632BinopGPRShift() = delete;

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
--- a/src/IceTargetLoweringARM32.def
+++ b/src/IceTargetLoweringARM32.def
@@ -15,6 +15,19 @@
 #ifndef SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
 #define SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF

-// TODO(jvoung): Fill out comparison tables, etc. for 32/64-bit compares.
+// Patterns for lowering icmp.
+#define ICMPARM32_TABLE                                             \
+  /* val, is_signed, swapped64, C_32, C1_64, C2_64 */               \
+  X(Eq,   false,     false,     EQ,   EQ,    NE)                    \
+  X(Ne,   false,     false,     NE,   NE,    EQ)                    \
+  X(Ugt,  false,     false,     HI,   HI,    LS)                    \
+  X(Uge,  false,     false,     CS,   CS,    CC)                    \
+  X(Ult,  false,     false,     CC,   CC,    CS)                    \
+  X(Ule,  false,     false,     LS,   LS,    HI)                    \
+  X(Sgt,  true,      true,      GT,   LT,    GE)                    \
+  X(Sge,  true,      false,     GE,   GE,    LT)                    \
+  X(Slt,  true,      false,     LT,   LT,    GE)                    \
+  X(Sle,  true,      true,      LE,   GE,    LT)                    \
+//#define X(val, is_signed, swapped64, C_32, C1_64, C2_64)

 #endif // SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -130,71 +130,119 @@ protected:
  // with minimal syntactic overhead, so that the lowering code can
  // look as close to assembly as practical.

-  void _add(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1));
+  void _add(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1, Pred));
  }
-  void _adds(Variable *Dest, Variable *Src0, Operand *Src1) {
+  void _adds(Variable *Dest, Variable *Src0, Operand *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
    const bool SetFlags = true;
-    Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1, SetFlags));
+    Context.insert(
+        InstARM32Add::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
-  void _adc(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Adc::create(Func, Dest, Src0, Src1));
+  void _adc(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Adc::create(Func, Dest, Src0, Src1, Pred));
  }
-  void _and(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32And::create(Func, Dest, Src0, Src1));
+  void _and(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32And::create(Func, Dest, Src0, Src1, Pred));
  }
-  void _eor(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1));
+  void _br(CondARM32::Cond Condition, CfgNode *TargetTrue,
+           CfgNode *TargetFalse) {
+    Context.insert(
+        InstARM32Br::create(Func, TargetTrue, TargetFalse, Condition));
  }
-  void _ldr(Variable *Dest, OperandARM32Mem *Addr) {
-    Context.insert(InstARM32Ldr::create(Func, Dest, Addr));
+  void _br(CfgNode *Target) {
+    Context.insert(InstARM32Br::create(Func, Target));
  }
-  void _mla(Variable *Dest, Variable *Src0, Variable *Src1, Variable *Acc) {
-    Context.insert(InstARM32Mla::create(Func, Dest, Src0, Src1, Acc));
+  void _cmp(Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Cmp::create(Func, Src0, Src1, Pred));
+  }
+  void _eor(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1, Pred));
+  }
+  void _ldr(Variable *Dest, OperandARM32Mem *Addr,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Ldr::create(Func, Dest, Addr, Pred));
+  }
+  void _lsl(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Lsl::create(Func, Dest, Src0, Src1, Pred));
+  }
+  void _mla(Variable *Dest, Variable *Src0, Variable *Src1, Variable *Acc,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Mla::create(Func, Dest, Src0, Src1, Acc, Pred));
  }
  // If Dest=nullptr is passed in, then a new variable is created,
  // marked as infinite register allocation weight, and returned
  // through the in/out Dest argument.
  void _mov(Variable *&Dest, Operand *Src0,
+            CondARM32::Cond Pred = CondARM32::AL,
            int32_t RegNum = Variable::NoRegister) {
    if (Dest == nullptr)
      Dest = makeReg(Src0->getType(), RegNum);
-    Context.insert(InstARM32Mov::create(Func, Dest, Src0));
+    Context.insert(InstARM32Mov::create(Func, Dest, Src0, Pred));
+  }
+  void _mov_nonkillable(Variable *Dest, Operand *Src0,
+                        CondARM32::Cond Pred = CondARM32::AL) {
+    Inst *NewInst = InstARM32Mov::create(Func, Dest, Src0, Pred);
+    NewInst->setDestNonKillable();
+    Context.insert(NewInst);
  }
  // The Operand can only be a 16-bit immediate or a ConstantRelocatable
  // (with an upper16 relocation).
-  void _movt(Variable *Dest, Operand *Src0) {
-    Context.insert(InstARM32Movt::create(Func, Dest, Src0));
+  void _movt(Variable *Dest, Operand *Src0,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Movt::create(Func, Dest, Src0, Pred));
+  }
+  void _movw(Variable *Dest, Operand *Src0,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Movw::create(Func, Dest, Src0, Pred));
  }
-  void _movw(Variable *Dest, Operand *Src0) {
-    Context.insert(InstARM32Movw::create(Func, Dest, Src0));
+  void _mul(Variable *Dest, Variable *Src0, Variable *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Mul::create(Func, Dest, Src0, Src1, Pred));
  }
-  void _mul(Variable *Dest, Variable *Src0, Variable *Src1) {
-    Context.insert(InstARM32Mul::create(Func, Dest, Src0, Src1));
+  void _mvn(Variable *Dest, Operand *Src0,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Mvn::create(Func, Dest, Src0, Pred));
  }
-  void _mvn(Variable *Dest, Operand *Src0) {
-    Context.insert(InstARM32Mvn::create(Func, Dest, Src0));
+  void _orr(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Orr::create(Func, Dest, Src0, Src1, Pred));
  }
-  void _orr(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Orr::create(Func, Dest, Src0, Src1));
+  void _sbc(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred));
  }
-  void _sbc(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Sbc::create(Func, Dest, Src0, Src1));
+  void _sbcs(Variable *Dest, Variable *Src0, Operand *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    const bool SetFlags = true;
+    Context.insert(
+        InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
-  void _sub(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Sub::create(Func, Dest, Src0, Src1));
+  void _sub(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Sub::create(Func, Dest, Src0, Src1, Pred));
  }
-  void _subs(Variable *Dest, Variable *Src0, Operand *Src1) {
+  void _subs(Variable *Dest, Variable *Src0, Operand *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
    const bool SetFlags = true;
-    Context.insert(InstARM32Sub::create(Func, Dest, Src0, Src1, SetFlags));
+    Context.insert(
+        InstARM32Sub::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
  void _ret(Variable *LR, Variable *Src0 = nullptr) {
    Context.insert(InstARM32Ret::create(Func, LR, Src0));
  }
  void _umull(Variable *DestLo, Variable *DestHi, Variable *Src0,
-              Variable *Src1) {
-    Context.insert(InstARM32Umull::create(Func, DestLo, DestHi, Src0, Src1));
+              Variable *Src1, CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(
+        InstARM32Umull::create(Func, DestLo, DestHi, Src0, Src1, Pred));
    // Model the modification to the second dest as a fake def.
+    // Note that the def is not predicated.
    Context.insert(InstFakeDef::create(Func, DestHi, DestLo));
  }


--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -2,10 +2,20 @@
 ; particular the patterns for lowering i64 operations into constituent
 ; i32 operations on x86-32.

-; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 \
-; RUN:   | FileCheck %s
-; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 \
-; RUN:   | FileCheck --check-prefix=OPTM1 %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck --check-prefix=OPTM1 %s
+
+; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
+; once enough infrastructure is in. Also, switch to --filetype=obj
+; when possible.
+; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
+; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s

 @__init_array_start = internal constant [0 x i8] zeroinitializer, align 4
 @__fini_array_start = internal constant [0 x i8] zeroinitializer, align 4
@@ -114,6 +124,10 @@ entry:
 ; OPTM1: mov     {{.*}},DWORD PTR [esp+0x4]
 ; OPTM1: mov     {{.*}},DWORD PTR [esp+0x8]

+; Nothing to do for ARM O2 -- arg and return value are in r0,r1.
+; ARM32-LABEL: return64BitArg
+; ARM32-NEXT: bx lr
+
 define internal i64 @return64BitConst() {
 entry:
  ret i64 -2401053092306725256
@@ -126,6 +140,12 @@ entry:
 ; OPTM1: mov     eax,0x12345678
 ; OPTM1: mov     edx,0xdeadbeef

+; ARM32-LABEL: return64BitConst
+; ARM32: movw r0, #22136 ; 0x5678
+; ARM32: movt r0, #4660  ; 0x1234
+; ARM32: movw r1, #48879 ; 0xbeef
+; ARM32: movt r1, #57005 ; 0xdead
+
 define internal i64 @add64BitSigned(i64 %a, i64 %b) {
 entry:
  %add = add i64 %b, %a
@@ -139,6 +159,10 @@ entry:
 ; OPTM1: add
 ; OPTM1: adc

+; ARM32-LABEL: add64BitSigned
+; ARM32: adds
+; ARM32: adc
+
 define internal i64 @add64BitUnsigned(i64 %a, i64 %b) {
 entry:
  %add = add i64 %b, %a
@@ -152,6 +176,10 @@ entry:
 ; OPTM1: add
 ; OPTM1: adc

+; ARM32-LABEL: add64BitUnsigned
+; ARM32: adds
+; ARM32: adc
+
 define internal i64 @sub64BitSigned(i64 %a, i64 %b) {
 entry:
  %sub = sub i64 %a, %b
@@ -165,6 +193,10 @@ entry:
 ; OPTM1: sub
 ; OPTM1: sbb

+; ARM32-LABEL: sub64BitSigned
+; ARM32: subs
+; ARM32: sbc
+
 define internal i64 @sub64BitUnsigned(i64 %a, i64 %b) {
 entry:
  %sub = sub i64 %a, %b
@@ -178,6 +210,10 @@ entry:
 ; OPTM1: sub
 ; OPTM1: sbb

+; ARM32-LABEL: sub64BitUnsigned
+; ARM32: subs
+; ARM32: sbc
+
 define internal i64 @mul64BitSigned(i64 %a, i64 %b) {
 entry:
  %mul = mul i64 %b, %a
@@ -197,6 +233,12 @@ entry:
 ; OPTM1: add
 ; OPTM1: add

+; ARM32-LABEL: mul64BitSigned
+; ARM32: mul
+; ARM32: mla
+; ARM32: umull
+; ARM32: add
+
 define internal i64 @mul64BitUnsigned(i64 %a, i64 %b) {
 entry:
  %mul = mul i64 %b, %a
@@ -216,6 +258,12 @@ entry:
 ; OPTM1: add
 ; OPTM1: add

+; ARM32-LABEL: mul64BitUnsigned
+; ARM32: mul
+; ARM32: mla
+; ARM32: umull
+; ARM32: add
+
 define internal i64 @div64BitSigned(i64 %a, i64 %b) {
 entry:
  %div = sdiv i64 %a, %b
@@ -413,6 +461,10 @@ entry:
 ; OPTM1: and
 ; OPTM1: and

+; ARM32-LABEL: and64BitSigned
+; ARM32: and
+; ARM32: and
+
 define internal i64 @and64BitUnsigned(i64 %a, i64 %b) {
 entry:
  %and = and i64 %b, %a
@@ -426,6 +478,10 @@ entry:
 ; OPTM1: and
 ; OPTM1: and

+; ARM32-LABEL: and64BitUnsigned
+; ARM32: and
+; ARM32: and
+
 define internal i64 @or64BitSigned(i64 %a, i64 %b) {
 entry:
  %or = or i64 %b, %a
@@ -439,6 +495,10 @@ entry:
 ; OPTM1: or
 ; OPTM1: or

+; ARM32-LABEL: or64BitSigned
+; ARM32: orr
+; ARM32: orr
+
 define internal i64 @or64BitUnsigned(i64 %a, i64 %b) {
 entry:
  %or = or i64 %b, %a
@@ -452,6 +512,10 @@ entry:
 ; OPTM1: or
 ; OPTM1: or

+; ARM32-LABEL: or64BitUnsigned
+; ARM32: orr
+; ARM32: orr
+
 define internal i64 @xor64BitSigned(i64 %a, i64 %b) {
 entry:
  %xor = xor i64 %b, %a
@@ -465,6 +529,10 @@ entry:
 ; OPTM1: xor
 ; OPTM1: xor

+; ARM32-LABEL: xor64BitSigned
+; ARM32: eor
+; ARM32: eor
+
 define internal i64 @xor64BitUnsigned(i64 %a, i64 %b) {
 entry:
  %xor = xor i64 %b, %a
@@ -478,6 +546,10 @@ entry:
 ; OPTM1: xor
 ; OPTM1: xor

+; ARM32-LABEL: xor64BitUnsigned
+; ARM32: eor
+; ARM32: eor
+
 define internal i32 @trunc64To32Signed(i64 %a) {
 entry:
  %conv = trunc i64 %a to i32
@@ -746,6 +818,20 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; OPTM1: je
 ; OPTM1: call

+; ARM32-LABEL: icmpEq64
+; ARM32: cmp
+; ARM32: cmpeq
+; ARM32: moveq
+; ARM32: movne
+; ARM32: beq
+; ARM32: bl
+; ARM32: cmp
+; ARM32: cmpeq
+; ARM32: moveq
+; ARM32: movne
+; ARM32: beq
+; ARM32: bl
+
 declare void @func()

 define internal void @icmpNe64(i64 %a, i64 %b, i64 %c, i64 %d) {
@@ -784,6 +870,20 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; OPTM1: jne
 ; OPTM1: call

+; ARM32-LABEL: icmpNe64
+; ARM32: cmp
+; ARM32: cmpeq
+; ARM32: movne
+; ARM32: moveq
+; ARM32: beq
+; ARM32: bl
+; ARM32: cmp
+; ARM32: cmpeq
+; ARM32: movne
+; ARM32: moveq
+; ARM32: beq
+; ARM32: bl
+
 define internal void @icmpGt64(i64 %a, i64 %b, i64 %c, i64 %d) {
 entry:
  %cmp = icmp ugt i64 %a, %b
@@ -824,6 +924,20 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; OPTM1: ja
 ; OPTM1: call

+; ARM32-LABEL: icmpGt64
+; ARM32: cmp
+; ARM32: cmpeq
+; ARM32: movhi
+; ARM32: movls
+; ARM32: beq
+; ARM32: bl
+; ARM32: cmp
+; ARM32: sbcs
+; ARM32: movlt
+; ARM32: movge
+; ARM32: beq
+; ARM32: bl
+
 define internal void @icmpGe64(i64 %a, i64 %b, i64 %c, i64 %d) {
 entry:
  %cmp = icmp uge i64 %a, %b
@@ -864,6 +978,20 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; OPTM1: jae
 ; OPTM1: call

+; ARM32-LABEL: icmpGe64
+; ARM32: cmp
+; ARM32: cmpeq
+; ARM32: movcs
+; ARM32: movcc
+; ARM32: beq
+; ARM32: bl
+; ARM32: cmp
+; ARM32: sbcs
+; ARM32: movge
+; ARM32: movlt
+; ARM32: beq
+; ARM32: bl
+
 define internal void @icmpLt64(i64 %a, i64 %b, i64 %c, i64 %d) {
 entry:
  %cmp = icmp ult i64 %a, %b
@@ -904,6 +1032,20 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; OPTM1: jb
 ; OPTM1: call

+; ARM32-LABEL: icmpLt64
+; ARM32: cmp
+; ARM32: cmpeq
+; ARM32: movcc
+; ARM32: movcs
+; ARM32: beq
+; ARM32: bl
+; ARM32: cmp
+; ARM32: sbcs
+; ARM32: movlt
+; ARM32: movge
+; ARM32: beq
+; ARM32: bl
+
 define internal void @icmpLe64(i64 %a, i64 %b, i64 %c, i64 %d) {
 entry:
  %cmp = icmp ule i64 %a, %b
@@ -944,6 +1086,20 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; OPTM1: jbe
 ; OPTM1: call

+; ARM32-LABEL: icmpLe64
+; ARM32: cmp
+; ARM32: cmpeq
+; ARM32: movls
+; ARM32: movhi
+; ARM32: beq
+; ARM32: bl
+; ARM32: cmp
+; ARM32: sbcs
+; ARM32: movge
+; ARM32: movlt
+; ARM32: beq
+; ARM32: bl
+
 define internal i32 @icmpEq64Bool(i64 %a, i64 %b) {
 entry:
  %cmp = icmp eq i64 %a, %b
@@ -1275,6 +1431,8 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; CHECK-NOT: cmp 0x{{[0-9a-f]+}},
 ; OPTM1-LABEL: icmpEq64Imm
 ; OPTM1-NOT: cmp 0x{{[0-9a-f]+}},
+; ARM32-LABEL: icmpEq64Imm
+; ARM32-NOT: cmp #{{[0-9a-f]+}},

 define internal void @icmpLt64Imm() {
 entry:
@@ -1302,3 +1460,5 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; CHECK-NOT: cmp 0x{{[0-9a-f]+}},
 ; OPTM1-LABEL: icmpLt64Imm
 ; OPTM1-NOT: cmp 0x{{[0-9a-f]+}},
+; ARM32-LABEL: icmpLt64Imm
+; ARM32-NOT: cmp #{{[0-9a-f]+}},
--- a/tests_lit/llvm2ice_tests/branch-opt.ll
+++ b/tests_lit/llvm2ice_tests/branch-opt.ll
 ; Tests the branch optimizations under O2 (against a lack of
 ; optimizations under Om1).

-; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 \
-; RUN:   | FileCheck --check-prefix=O2 %s
-; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 \
-; RUN:   | FileCheck --check-prefix=OM1 %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck --check-prefix=O2 %s
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck --check-prefix=OM1 %s
+
+; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
+; once enough infrastructure is in. Also, switch to --filetype=obj
+; when possible.
+; Also test Om1 when addProlog is done.
+; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
+; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32O2 %s

 declare void @dummy()

@@ -29,6 +40,10 @@ next:
 ; OM1-NEXT: jmp
 ; OM1: call

+; ARM32O2-LABEL: testUncondToNextBlock
+; ARM32O2: bl {{.*}} dummy
+; ARM32O2-NEXT: bl {{.*}} dummy
+
 ; For a conditional branch with a fallthrough to the next block, the
 ; fallthrough branch should be removed.
 define void @testCondFallthroughToNextBlock(i32 %arg) {
@@ -62,6 +77,17 @@ target:
 ; OM1: call
 ; OM1: ret

+; Note that compare and branch folding isn't implemented yet (unlike x86-32).
+; ARM32O2-LABEL: testCondFallthroughToNextBlock
+; ARM32O2: cmp {{.*}}, #123
+; ARM32O2-NEXT: movge {{.*}}, #1
+; ARM32O2-NEXT: cmp {{.*}}, #0
+; ARM32O2-NEXT: bne
+; ARM32O2-NEXT: bl
+; ARM32O2-NEXT: bx lr
+; ARM32O2-NEXT: bl
+; ARM32O2-NEXT: bx lr
+
 ; For a conditional branch with the next block as the target and a
 ; different block as the fallthrough, the branch condition should be
 ; inverted, the fallthrough block changed to the target, and the
@@ -96,3 +122,15 @@ target:
 ; OM1: ret
 ; OM1: call
 ; OM1: ret
+
+; Note that compare and branch folding isn't implemented yet
+; (compared to x86-32).
+; ARM32O2-LABEL: testCondTargetNextBlock
+; ARM32O2: cmp {{.*}}, #123
+; ARM32O2-NEXT: movge {{.*}}, #1
+; ARM32O2-NEXT: cmp {{.*}}, #0
+; ARM32O2-NEXT: beq
+; ARM32O2-NEXT: bl
+; ARM32O2-NEXT: bx lr
+; ARM32O2-NEXT: bl
+; ARM32O2-NEXT: bx lr
--- a/tests_lit/llvm2ice_tests/int-arg.ll
+++ b/tests_lit/llvm2ice_tests/int-arg.ll
 ; This file checks that Subzero generates code in accordance with the
 ; calling convention for integers.

-; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s

 ; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
 ; once enough infrastructure is in. Also, switch to --filetype=obj