Handle ARM "ret void" and function alignment with proper padding.

Modify run-pnacl-sz to pass in the correct assembler/disassembler flags for ARM when not using the integrated assembler. Model the "ret" pseudo instruction (special form of "bx" inst). It is kept separate from "bx" to allow epilogue insertion to find the terminator. Add a flag "--skip-unimplemented" to skip through all of the "Not yet implemented" assertions, and use that in the test. Set up a stack trace printer when ALLOW_DUMP so that the UnimplementedError prints out some useful information about *which* case is unimplemented. Change the ".type ...,@function" directive to use %function instead of @function. ARM assembler seems to only like %function because "@" is a comment character. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1136793002

Handle ARM "ret void" and function alignment with proper padding.
b2d5084c · Jan Voung · b33a2af2 · b2d5084c · b2d5084c · b2d5084c
Commit b2d5084c authored May 12, 2015 by Jan Voung
17 changed files
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -177,6 +177,7 @@ SRCS = \
 	IceGlobalContext.cpp \
 	IceGlobalInits.cpp \
 	IceInst.cpp \
+	IceInstARM32.cpp \
 	IceInstX8632.cpp \
 	IceIntrinsics.cpp \
 	IceLiveness.cpp \

--- a/pydir/run-pnacl-sz.py
+++ b/pydir/run-pnacl-sz.py
@@ -10,6 +10,22 @@ import tempfile

 from utils import shellcmd

+
+def TargetAssemblerFlags(target):
+  """Return the list of target-specific llvm-mc flags for `target`.
+
+  `target` must be one of the keys of the table below ('x8632' or 'arm32');
+  any other value raises KeyError.
+  """
+  # TODO(stichnot): -triple=i686-nacl should be used for a
+  # sandboxing test.  This means there should be an args.sandbox
+  # argument that also gets passed through to pnacl-sz.
+  flags = { 'x8632': ['-triple=i686'],
+            'arm32': ['-triple=armv7a', '-mcpu=cortex-a9', '-mattr=+neon'] }
+  return flags[target]
+
+
+def TargetDisassemblerFlags(target):
+  """Return the list of target-specific objdump flags for `target`.
+
+  `target` must be 'x8632' or 'arm32'; any other value raises KeyError.
+  The arm32 entry is an empty list, i.e. no extra flags are added.
+  """
+  flags = { 'x8632': ['-Mintel'],
+            'arm32': [] }
+  return flags[target]
+
+
 def main():
    """Run the pnacl-sz compiler on an llvm file.

@@ -56,6 +72,9 @@ def main():
    argparser.add_argument('--filetype', default='iasm', dest='filetype',
                           choices=['obj', 'asm', 'iasm'],
                           help='Output file type.  Default %(default)s.')
+    argparser.add_argument('--target', default='x8632', dest='target',
+                           choices=['x8632','arm32'],
+                           help='Target architecture.  Default %(default)s.')
    argparser.add_argument('--echo-cmd', required=False,
                           action='store_true',
                           help='Trace command that generates ICE instructions')
@@ -82,6 +101,7 @@ def main():
        cmd += ['--allow-local-symbol-tables']
      cmd += ['|']
    cmd += [args.pnacl_sz]
+    cmd += ['--target', args.target]
    if args.insts:
      # If the tests are based on '-verbose inst' output, force
      # single-threaded translation because dump output does not get
@@ -107,19 +127,17 @@ def main():
      asm_temp = tempfile.NamedTemporaryFile(delete=False)
      asm_temp.close()
    if args.assemble and args.filetype != 'obj':
-      cmd += ['|', os.path.join(pnacl_bin_path, 'llvm-mc'),
-              # TODO(stichnot): -triple=i686-nacl should be used for a
-              # sandboxing test.  This means there should be an args.sandbox
-              # argument that also gets passed through to pnacl-sz.
-              '-triple=i686',
-              '-filetype=obj', '-o', asm_temp.name]
+      cmd += (['|', os.path.join(pnacl_bin_path, 'llvm-mc')] +
+              TargetAssemblerFlags(args.target) +
+              ['-filetype=obj', '-o', asm_temp.name])
    elif asm_temp:
      cmd += ['-o', asm_temp.name]
    if args.disassemble:
      # Show wide instruction encodings, diassemble, and show relocs.
      cmd += (['&&', os.path.join(pnacl_bin_path, 'le32-nacl-objdump')] +
              args.dis_flags +
-              ['-w', '-d', '-r', '-Mintel', asm_temp.name])
+              ['-w', '-d', '-r'] + TargetDisassemblerFlags(args.target) +
+              [asm_temp.name])

    stdout_result = shellcmd(cmd, echo=args.echo_cmd)
    if not args.echo_cmd:

--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -464,9 +464,10 @@ void Cfg::emitTextHeader(const IceString &MangledName, GlobalContext *Ctx,
    Str << "\t.section\t.text." << MangledName << ",\"ax\",@progbits\n";
  if (!Asm->getInternal() || Ctx->getFlags().getDisableInternal()) {
    Str << "\t.globl\t" << MangledName << "\n";
-    Str << "\t.type\t" << MangledName << ",@function\n";
+    Str << "\t.type\t" << MangledName << ",%function\n";
  }
-  Str << "\t.p2align " << Asm->getBundleAlignLog2Bytes() << ",0x";
+  Str << "\t" << Asm->getNonExecPadDirective() << " "
+      << Asm->getBundleAlignLog2Bytes() << ",0x";
  for (uint8_t I : Asm->getNonExecBundlePadding())
    Str.write_hex(I);
  Str << "\n";

--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -111,6 +111,11 @@ cl::opt<bool>
                                cl::desc("Randomize register allocation"),
                                cl::init(false));

+cl::opt<bool> SkipUnimplemented(
+    "skip-unimplemented",
+    cl::desc("Skip through unimplemented lowering code instead of aborting."),
+    cl::init(false));
+
 cl::opt<bool> SubzeroTimingEnabled(
    "timing", cl::desc("Enable breakdown timing of Subzero translation"));

@@ -260,6 +265,7 @@ void ClFlags::resetClFlags(ClFlags &OutFlags) {
  OutFlags.PhiEdgeSplit = false;
  OutFlags.RandomNopInsertion = false;
  OutFlags.RandomRegAlloc = false;
+  OutFlags.SkipUnimplemented = false;
  OutFlags.SubzeroTimingEnabled = false;
  OutFlags.TimeEachFunction = false;
  OutFlags.UseSandboxing = false;
@@ -311,6 +317,7 @@ void ClFlags::getParsedClFlags(ClFlags &OutFlags) {
  OutFlags.setRandomSeed(::RandomSeed);
  OutFlags.setShouldDoNopInsertion(::ShouldDoNopInsertion);
  OutFlags.setShouldRandomizeRegAlloc(::RandomizeRegisterAllocation);
+  OutFlags.setSkipUnimplemented(::SkipUnimplemented);
  OutFlags.setSubzeroTimingEnabled(::SubzeroTimingEnabled);
  OutFlags.setTargetArch(::TargetArch);
  OutFlags.setTargetInstructionSet(::TargetInstructionSet);

--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -86,6 +86,9 @@ public:
  bool shouldRandomizeRegAlloc() const { return RandomRegAlloc; }
  void setShouldRandomizeRegAlloc(bool NewValue) { RandomRegAlloc = NewValue; }

+  bool getSkipUnimplemented() const { return SkipUnimplemented; }
+  void setSkipUnimplemented(bool NewValue) { SkipUnimplemented = NewValue; }
+
  bool getSubzeroTimingEnabled() const { return SubzeroTimingEnabled; }
  void setSubzeroTimingEnabled(bool NewValue) {
    SubzeroTimingEnabled = NewValue;
@@ -184,6 +187,7 @@ private:
  bool PhiEdgeSplit;
  bool RandomNopInsertion;
  bool RandomRegAlloc;
+  bool SkipUnimplemented;
  bool SubzeroTimingEnabled;
  bool TimeEachFunction;
  bool UseSandboxing;

--- a/src/IceCompileServer.cpp
+++ b/src/IceCompileServer.cpp
@@ -17,6 +17,7 @@

 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/StreamingMemoryObject.h"

@@ -49,6 +50,9 @@ ErrorCodes getReturnValue(const Ice::ClFlagsExtra &Flags, ErrorCodes Val) {
 } // end of anonymous namespace

 void CLCompileServer::run() {
+  if (ALLOW_DUMP) {
+    llvm::sys::PrintStackTraceOnErrorSignal();
+  }
  ClFlags::parseFlags(argc, argv);
  ClFlags Flags;
  ClFlagsExtra ExtraFlags;

--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
+//===- subzero/src/IceInstARM32.cpp - ARM32 instruction implementation ----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the InstARM32 and OperandARM32 classes,
+// primarily the constructors and the dump()/emit() methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "assembler_arm32.h"
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceInst.h"
+#include "IceInstARM32.h"
+#include "IceOperand.h"
+#include "IceRegistersARM32.h"
+#include "IceTargetLoweringARM32.h"
+
+namespace Ice {
+
+namespace {
+
+const struct TypeARM32Attributes_ {
+  const char *WidthString; // b, h, <blank>, or d
+  int8_t SExtAddrOffsetBits;
+  int8_t ZExtAddrOffsetBits;
+} TypeARM32Attributes[] = {
+#define X(tag, elementty, width, sbits, ubits)                                 \
+  { width, sbits, ubits }                                                      \
+  ,
+    ICETYPEARM32_TABLE
+#undef X
+};
+
+} // end of anonymous namespace
+
+const char *InstARM32::getWidthString(Type Ty) {
+  return TypeARM32Attributes[Ty].WidthString;
+}
+
+// Returns true if Offset is acceptable as the immediate offset for a
+// load/store of type Ty. The number of offset bits is looked up in
+// TypeARM32Attributes, using the sign-extending column when SignExt is true
+// and the zero-extending column otherwise.
+bool OperandARM32Mem::canHoldOffset(Type Ty, bool SignExt, int32_t Offset) {
+  int32_t Bits = SignExt ? TypeARM32Attributes[Ty].SExtAddrOffsetBits
+                         : TypeARM32Attributes[Ty].ZExtAddrOffsetBits;
+  // With 0 offset bits, only a zero offset is accepted.
+  if (Bits == 0)
+    return Offset == 0;
+  // Note that encodings for offsets are sign-magnitude for ARM, so we check
+  // with IsAbsoluteUint().
+  // Scalar floating-point types additionally require a 4-byte-aligned offset.
+  if (isScalarFloatingType(Ty))
+    return Utils::IsAligned(Offset, 4) && Utils::IsAbsoluteUint(Bits, Offset);
+  return Utils::IsAbsoluteUint(Bits, Offset);
+}
+
+// Constructs the Ret pseudo-instruction with no Dest. LR is always added as
+// Src(0); Source (the returned value), when non-null, is added as Src(1).
+// The maximum source count passed to the base class is therefore 2 when
+// Source is given and 1 otherwise.
+InstARM32Ret::InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source)
+    : InstARM32(Func, InstARM32::Ret, Source ? 2 : 1, nullptr) {
+  addSource(LR);
+  if (Source)
+    addSource(Source);
+}
+
+// ======================== Dump routines ======================== //
+
+void InstARM32::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "[ARM32] ";
+  Inst::dump(Func);
+}
+
+// Emits the textual form of the return: "\tbx\t" followed by the LR operand.
+// Does nothing when ALLOW_DUMP is off. Asserts that Src(0) exists, has a
+// register assigned, and that the register is lr. Any Src(1) return value
+// is not printed here.
+void InstARM32Ret::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  assert(getSrcSize() > 0);
+  Variable *LR = llvm::cast<Variable>(getSrc(0));
+  assert(LR->hasReg());
+  assert(LR->getRegNum() == RegARM32::Reg_lr);
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\tbx\t";
+  LR->emit(Func);
+}
+
+void InstARM32Ret::emitIAS(const Cfg *Func) const {
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+// Dumps the Ret pseudo-instruction as "ret.<type> <sources>". Does nothing
+// when ALLOW_DUMP is off.
+// The constructor always adds LR as Src(0) and the optional return value as
+// Src(1), so a source count of 1 means "ret void" and otherwise the returned
+// type must be read from Src(1), not from LR at Src(0).
+void InstARM32Ret::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = (getSrcSize() == 1 ? IceType_void : getSrc(1)->getType());
+  Str << "ret." << Ty << " ";
+  dumpSources(Func);
+}
+
+} // end of namespace Ice
--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -61,5 +61,24 @@

 // TODO(jvoung): add condition code tables, etc.

+// Load/Store instruction width suffixes.
+#define ICETYPEARM32_TABLE                                              \
+  /* tag,          element type, width, addr off bits sext, zext */     \
+  X(IceType_void,  IceType_void, "",  0, 0)                             \
+  X(IceType_i1,    IceType_void, "b", 8, 12)                            \
+  X(IceType_i8,    IceType_void, "b", 8, 12)                            \
+  X(IceType_i16,   IceType_void, "h", 8, 8)                             \
+  X(IceType_i32,   IceType_void, "", 12, 12)                            \
+  X(IceType_i64,   IceType_void, "d", 8, 8)                             \
+  X(IceType_f32,   IceType_void, "", 10, 10)                            \
+  X(IceType_f64,   IceType_void, "", 10, 10)                            \
+  X(IceType_v4i1,  IceType_i32 , "",  0,  0)                            \
+  X(IceType_v8i1,  IceType_i16 , "",  0,  0)                            \
+  X(IceType_v16i1, IceType_i8  , "",  0,  0)                            \
+  X(IceType_v16i8, IceType_i8  , "",  0,  0)                            \
+  X(IceType_v8i16, IceType_i16 , "",  0,  0)                            \
+  X(IceType_v4i32, IceType_i32 , "",  0,  0)                            \
+  X(IceType_v4f32, IceType_f32 , "",  0,  0)                            \
+//#define X(tag, elementty, width, sbits, ubits)

 #endif // SUBZERO_SRC_ICEINSTARM32_DEF
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -17,11 +17,104 @@
 #define SUBZERO_SRC_ICEINSTARM32_H

 #include "IceDefs.h"
+#include "IceInst.h"
+#include "IceInstARM32.def"
+#include "IceOperand.h"

 namespace Ice {

 class TargetARM32;
-// Fill this in.
+
+// OperandARM32 extends the Operand hierarchy.
+// TODO(jvoung): Add the OperandARM32Mem and OperandARM32Flex.
+class OperandARM32 : public Operand {
+  OperandARM32() = delete;
+  OperandARM32(const OperandARM32 &) = delete;
+  OperandARM32 &operator=(const OperandARM32 &) = delete;
+
+public:
+  enum OperandKindARM32 { k__Start = Operand::kTarget };
+
+  enum ShiftKind {
+    kNoShift = -1,
+#define X(enum, emit) enum,
+    ICEINSTARM32SHIFT_TABLE
+#undef X
+  };
+
+  using Operand::dump;
+  void dump(const Cfg *, Ostream &Str) const override {
+    if (ALLOW_DUMP)
+      Str << "<OperandARM32>";
+  }
+
+protected:
+  OperandARM32(OperandKindARM32 Kind, Type Ty)
+      : Operand(static_cast<OperandKind>(Kind), Ty) {}
+  ~OperandARM32() override {}
+};
+
+// OperandARM32Mem represents a memory operand in any of the various ARM32
+// addressing modes.
+// TODO(jvoung): Fill out more.
+class OperandARM32Mem : public OperandARM32 {
+  OperandARM32Mem() = delete;
+  OperandARM32Mem(const OperandARM32Mem &) = delete;
+  OperandARM32Mem &operator=(const OperandARM32Mem &) = delete;
+
+public:
+  // Return true if a load/store instruction for an element of type Ty
+  // can encode the Offset directly in the immediate field of the 32-bit
+  // ARM instruction. For some types, if the load is Sign extending, then
+  // the range is reduced.
+  static bool canHoldOffset(Type Ty, bool SignExt, int32_t Offset);
+};
+
+class InstARM32 : public InstTarget {
+  InstARM32() = delete;
+  InstARM32(const InstARM32 &) = delete;
+  InstARM32 &operator=(const InstARM32 &) = delete;
+
+public:
+  enum InstKindARM32 { k__Start = Inst::Target, Ret };
+
+  static const char *getWidthString(Type Ty);
+
+  void dump(const Cfg *Func) const override;
+
+protected:
+  InstARM32(Cfg *Func, InstKindARM32 Kind, SizeT Maxsrcs, Variable *Dest)
+      : InstTarget(Func, static_cast<InstKind>(Kind), Maxsrcs, Dest) {}
+  ~InstARM32() override {}
+  static bool isClassof(const Inst *Inst, InstKindARM32 MyKind) {
+    return Inst->getKind() == static_cast<InstKind>(MyKind);
+  }
+};
+
+// Ret pseudo-instruction.  This is actually a "bx" instruction with
+// an "lr" register operand, but epilogue lowering will search for a Ret
+// instead of a generic "bx". This instruction also takes a Source
+// operand (for non-void returning functions) for liveness analysis, though
+// a FakeUse before the ret would do just as well.
+class InstARM32Ret : public InstARM32 {
+  InstARM32Ret() = delete;
+  InstARM32Ret(const InstARM32Ret &) = delete;
+  InstARM32Ret &operator=(const InstARM32Ret &) = delete;
+
+public:
+  static InstARM32Ret *create(Cfg *Func, Variable *LR,
+                              Variable *Source = nullptr) {
+    return new (Func->allocate<InstARM32Ret>()) InstARM32Ret(Func, LR, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Ret); }
+
+private:
+  InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source);
+  ~InstARM32Ret() override {}
+};

 } // end of namespace Ice


--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -16,6 +16,7 @@
 #define SUBZERO_SRC_ICETARGETLOWERINGARM32_H

 #include "IceDefs.h"
+#include "IceInstARM32.h"
 #include "IceRegistersARM32.h"
 #include "IceTargetLowering.h"

@@ -91,6 +92,14 @@ protected:

  static Type stackSlotType();

+  // The following are helpers that insert lowered ARM32 instructions
+  // with minimal syntactic overhead, so that the lowering code can
+  // look as close to assembly as practical.
+
+  void _ret(Variable *LR, Variable *Src0 = nullptr) {
+    Context.insert(InstARM32Ret::create(Func, LR, Src0));
+  }
+
  bool UsesFramePointer;
  bool NeedsStackAlignment;
  llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -509,13 +509,13 @@ void TargetX8632::emitVariable(const Variable *Var) const {
  }
  if (Var->getWeight().isInf())
    llvm_unreachable("Infinite-weight Variable has no register assigned");
-  const Type Ty = IceType_i32;
  int32_t Offset = Var->getStackOffset();
  if (!hasFramePointer())
    Offset += getStackAdjustment();
  if (Offset)
    Str << Offset;
-  Str << "(%" << getRegName(getFrameOrStackReg(), Ty) << ")";
+  const Type FrameSPTy = IceType_i32;
+  Str << "(%" << getRegName(getFrameOrStackReg(), FrameSPTy) << ")";
 }

 X8632::Address TargetX8632::stackVarToAsmOperand(const Variable *Var) const {

--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -51,11 +51,26 @@ public:
    return (0 <= value) && (value < limit);
  }

+  // Check whether the magnitude of value fits in N bits, i.e., whether an
+  // (N+1)-bit sign-magnitude representation can hold value.
+  // N must satisfy 0 < N < (bit width of T); this is asserted.
+  template <typename T> static inline bool IsAbsoluteUint(int N, T Value) {
+    assert((0 < N) &&
+           (static_cast<unsigned int>(N) < (CHAR_BIT * sizeof(Value))));
+    if (Value < 0) {
+      // The most negative value of T has no positive counterpart, so negating
+      // it would be signed overflow (undefined behavior). Its magnitude needs
+      // the full bit width of T, which is always more than N bits.
+      if (Value == std::numeric_limits<T>::min())
+        return false;
+      Value = -Value;
+    }
+    return IsUint(N, Value);
+  }
+
  template <typename T> static inline bool WouldOverflowAdd(T X, T Y) {
    return ((X > 0 && Y > 0 && (X > std::numeric_limits<T>::max() - Y)) ||
            (X < 0 && Y < 0 && (X < std::numeric_limits<T>::min() - Y)));
  }

+  // Returns true if X is a multiple of N. N must be a power of 2 (asserted),
+  // which lets the check be done by masking the low bits of X.
+  template <typename T> static inline bool IsAligned(T X, intptr_t N) {
+    assert(llvm::isPowerOf2_64(N));
+    return (X & (N - 1)) == 0;
+  }
+
  static inline uint64_t OffsetToAlignment(uint64_t Pos, uint64_t Align) {
    assert(llvm::isPowerOf2_64(Align));
    uint64_t Mod = Pos & (Align - 1);

--- a/src/assembler.h
+++ b/src/assembler.h
@@ -182,6 +182,7 @@ public:

  virtual SizeT getBundleAlignLog2Bytes() const = 0;

+  virtual const char *getNonExecPadDirective() const = 0;
  virtual llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const = 0;

  // Mark the current text location as the start of a CFG node

--- a/src/assembler_arm32.h
+++ b/src/assembler_arm32.h
@@ -42,29 +42,32 @@ public:
  }
  ~AssemblerARM32() override = default;

-  void alignFunction() override {
-    llvm::report_fatal_error("Not yet implemented.");
-  }
+  void alignFunction() override { llvm_unreachable("Not yet implemented."); }

  SizeT getBundleAlignLog2Bytes() const override { return 4; }

+  const char *getNonExecPadDirective() const override { return ".p2alignl"; }
+
  llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const override {
-    llvm::report_fatal_error("Not yet implemented.");
+    // Use a particular UDF encoding -- TRAPNaCl in LLVM: 0xE7FEDEF0
+    // http://llvm.org/viewvc/llvm-project?view=revision&revision=173943
+    static const uint8_t Padding[] = {0xE7, 0xFE, 0xDE, 0xF0};
+    return llvm::ArrayRef<uint8_t>(Padding, 4);
  }

  void padWithNop(intptr_t Padding) override {
    (void)Padding;
-    llvm::report_fatal_error("Not yet implemented.");
+    llvm_unreachable("Not yet implemented.");
  }

  void BindCfgNodeLabel(SizeT NodeNumber) override {
    (void)NodeNumber;
-    llvm::report_fatal_error("Not yet implemented.");
+    llvm_unreachable("Not yet implemented.");
  }

  bool fixupIsPCRel(FixupKind Kind) const override {
    (void)Kind;
-    llvm::report_fatal_error("Not yet implemented.");
+    llvm_unreachable("Not yet implemented.");
  }
 };


--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -352,6 +352,8 @@ public:

  SizeT getBundleAlignLog2Bytes() const override { return 5; }

+  const char *getNonExecPadDirective() const override { return ".p2align"; }
+
  llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const override {
    static const uint8_t Padding[] = {0xF4};
    return llvm::ArrayRef<uint8_t>(Padding, 1);

--- a/tests_lit/llvm2ice_tests/function_aligned.ll
+++ b/tests_lit/llvm2ice_tests/function_aligned.ll
@@ -4,6 +4,12 @@
 ; Also, we are currently using hlts for non-executable padding.

 ; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
+; once enough infrastructure is in. Also, switch to --filetype=obj
+; when possible.
+; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
+; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s

 define void @foo() {
  ret void
@@ -11,9 +17,16 @@ define void @foo() {
 ; CHECK-LABEL: foo
 ; CHECK-NEXT: 0: {{.*}} ret
 ; CHECK-NEXT: 1: {{.*}} hlt
+; ARM32-LABEL: foo
+; ARM32-NEXT: 0: {{.*}} bx lr
+; ARM32-NEXT: 4: e7fedef0 udf
+; ARM32-NEXT: 8: e7fedef0 udf
+; ARM32-NEXT: c: e7fedef0 udf

 define void @bar() {
  ret void
 }
 ; CHECK-LABEL: bar
 ; CHECK-NEXT: 20: {{.*}} ret
+; ARM32-LABEL: bar
+; ARM32-NEXT: 10: {{.*}} bx lr