Commit 9f42d8ca by Jim Stichnoth

Subzero: Add sandboxing for x86-32.

parent a2703ae4
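The sandboxing model constrains indirect control flow to fixed-size instruction bundles: computed branch targets are masked down to a bundle boundary, and the mask plus the jump/call through the masked register are kept within one bundle via bundle_lock/bundle_unlock regions. A minimal sketch of the mask arithmetic assumed by the lowering below (the helper name is hypothetical; the real code inlines the expression ~(BundleSize - 1)):

// Hypothetical helper mirroring the mask computed in addEpilog() and
// lowerCall(); with getBundleAlignLog2Bytes() == 5, BundleSize is 32
// and the mask is 0xFFFFFFE0, matching "and ecx,0xffffffe0" in the
// tests below.
inline uint32_t bundleTargetMask(SizeT BundleAlignLog2Bytes) {
  const uint32_t BundleSize = 1u << BundleAlignLog2Bytes;
  return ~(BundleSize - 1);
}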
@@ -436,6 +436,8 @@ void Cfg::emitTextHeader(const IceString &MangledName, GlobalContext *Ctx,
for (uint8_t I : Asm->getNonExecBundlePadding())
Str.write_hex(I);
Str << "\n";
if (Ctx->getFlags().getUseSandboxing())
Str << "\t.bundle_align_mode " << Asm->getBundleAlignLog2Bytes() << "\n";
Str << MangledName << ":\n";
}
...
@@ -834,6 +834,8 @@ void emitLiveRangesEnded(Ostream &Str, const Cfg *Func, const Inst *Instr,
}
void updateStats(Cfg *Func, const Inst *I) {
if (!ALLOW_DUMP)
return;
// Update emitted instruction count, plus fill/spill count for
// Variable operands without a physical register.
if (uint32_t Count = I->getEmitInstCount()) {
@@ -891,9 +893,127 @@ void CfgNode::emit(Cfg *Func) const {
emitRegisterUsage(Str, Func, this, false, LiveRegCount);
}
// Helper class for emitIAS().
namespace {
class BundleEmitHelper {
BundleEmitHelper() = delete;
BundleEmitHelper(const BundleEmitHelper &) = delete;
BundleEmitHelper &operator=(const BundleEmitHelper &) = delete;
public:
BundleEmitHelper(Assembler *Asm, const InstList &Insts)
: Asm(Asm), End(Insts.end()), BundleLockStart(End),
BundleSize(1 << Asm->getBundleAlignLog2Bytes()),
BundleMaskLo(BundleSize - 1), BundleMaskHi(~BundleMaskLo),
SizeSnapshotPre(0), SizeSnapshotPost(0) {}
// Check whether we're currently within a bundle_lock region.
bool isInBundleLockRegion() const { return BundleLockStart != End; }
// Check whether the current bundle_lock region has the align_to_end
// option.
bool isAlignToEnd() const {
assert(isInBundleLockRegion());
return llvm::cast<InstBundleLock>(getBundleLockStart())->getOption() ==
InstBundleLock::Opt_AlignToEnd;
}
// Check whether the entire bundle_lock region falls within the same
// bundle.
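// For example, with 32-byte bundles, a region occupying buffer offsets
// 0x3E through 0x43 starts in the bundle at 0x20 but ends in the bundle
// at 0x40, so it does not fall within a single bundle.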
bool isSameBundle() const {
assert(isInBundleLockRegion());
return SizeSnapshotPre == SizeSnapshotPost ||
(SizeSnapshotPre & BundleMaskHi) ==
((SizeSnapshotPost - 1) & BundleMaskHi);
}
// Get the bundle alignment of the first instruction of the
// bundle_lock region.
intptr_t getPreAlignment() const {
assert(isInBundleLockRegion());
return SizeSnapshotPre & BundleMaskLo;
}
// Get the bundle alignment of the first instruction past the
// bundle_lock region.
intptr_t getPostAlignment() const {
assert(isInBundleLockRegion());
return SizeSnapshotPost & BundleMaskLo;
}
// Get the iterator pointing to the bundle_lock instruction, e.g. to
// roll back the instruction iteration to that point.
InstList::const_iterator getBundleLockStart() const {
assert(isInBundleLockRegion());
return BundleLockStart;
}
// Set up bookkeeping when the bundle_lock instruction is first
// processed.
void enterBundleLock(InstList::const_iterator I) {
assert(!isInBundleLockRegion());
BundleLockStart = I;
SizeSnapshotPre = Asm->getBufferSize();
Asm->setPreliminary(true);
assert(isInBundleLockRegion());
}
// Update bookkeeping when the bundle_unlock instruction is
// processed.
void enterBundleUnlock() {
assert(isInBundleLockRegion());
SizeSnapshotPost = Asm->getBufferSize();
}
// Update bookkeeping when we are completely finished with the
// bundle_lock region.
void leaveBundleLockRegion() { BundleLockStart = End; }
// Check whether the instruction sequence fits within the current
// bundle, and if not, add nop padding to the end of the current
// bundle.
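// For example, with 32-byte bundles, a region starting at bundle offset
// 0x1D that crosses the boundary receives 32 - 0x1D = 3 bytes of nop
// padding so that it instead starts at the next bundle boundary.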
void padToNextBundle() {
assert(isInBundleLockRegion());
if (!isSameBundle()) {
intptr_t PadToNextBundle = BundleSize - getPreAlignment();
Asm->padWithNop(PadToNextBundle);
SizeSnapshotPre += PadToNextBundle;
SizeSnapshotPost += PadToNextBundle;
assert((Asm->getBufferSize() & BundleMaskLo) == 0);
assert(Asm->getBufferSize() == SizeSnapshotPre);
}
}
// If align_to_end is specified, add padding such that the
// instruction sequence ends precisely at a bundle boundary.
void padForAlignToEnd() {
assert(isInBundleLockRegion());
if (isAlignToEnd()) {
if (intptr_t Offset = getPostAlignment()) {
Asm->padWithNop(BundleSize - Offset);
SizeSnapshotPre = Asm->getBufferSize();
}
}
}
// Update bookkeeping when rolling back for the second pass.
void rollback() {
assert(isInBundleLockRegion());
Asm->setBufferSize(SizeSnapshotPre);
Asm->setPreliminary(false);
}
private:
Assembler *const Asm;
// End is a sentinel value such that BundleLockStart==End implies
// that we are not in a bundle_lock region.
const InstList::const_iterator End;
InstList::const_iterator BundleLockStart;
const intptr_t BundleSize;
// Masking with BundleMaskLo identifies an address's bundle offset.
const intptr_t BundleMaskLo;
// Masking with BundleMaskHi identifies an address's bundle.
const intptr_t BundleMaskHi;
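// For example, with BundleSize == 32 (BundleMaskLo == 0x1F), a buffer
// size of 0x47 corresponds to bundle offset 0x07 within the bundle
// that starts at 0x40 (0x47 & BundleMaskHi).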
intptr_t SizeSnapshotPre;
intptr_t SizeSnapshotPost;
};
} // end of anonymous namespace
void CfgNode::emitIAS(Cfg *Func) const {
Func->setCurrentNode(this);
Assembler *Asm = Func->getAssembler<>();
// TODO(stichnot): When sandboxing, defer binding the node label
// until just before the first instruction is emitted, to reduce the
// chance that a padding nop is a branch target.
Asm->BindCfgNodeLabel(getIndex());
for (const Inst &I : Phis) {
if (I.isDeleted())
@@ -901,14 +1021,99 @@ void CfgNode::emitIAS(Cfg *Func) const {
// Emitting a Phi instruction should cause an error.
I.emitIAS(Func);
}
// Do the simple emission if not sandboxed.
if (!Func->getContext()->getFlags().getUseSandboxing()) {
for (const Inst &I : Insts) {
if (!I.isDeleted() && !I.isRedundantAssign()) {
I.emitIAS(Func);
updateStats(Func, &I);
}
}
return;
}
// The remainder of the function handles emission with sandboxing.
// There are explicit bundle_lock regions delimited by bundle_lock
// and bundle_unlock instructions. All other instructions are
// treated as an implicit one-instruction bundle_lock region.
// Emission is done twice for each bundle_lock region. The first
// pass is a preliminary pass, after which we can figure out what
// nop padding is needed, then roll back, and make the final pass.
//
// Ideally, the first pass would be speculative and the second pass
// would only be done if nop padding were needed, but the structure
// of the integrated assembler makes it hard to roll back the state
// of label bindings, label links, and relocation fixups. Instead,
// the first pass just disables all mutation of that state.
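// In outline, each bundle_lock region is handled as follows:
//   pass 1: enterBundleLock() sets Preliminary=true; instructions are
//           emitted only to measure their size; at bundle_unlock we
//           rollback(), then padToNextBundle() and padForAlignToEnd()
//           as needed;
//   pass 2: the iterator is reset to getBundleLockStart() and the
//           region is re-emitted with Preliminary=false, committing
//           label bindings, label links, and relocation fixups.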
BundleEmitHelper Helper(Asm, Insts);
InstList::const_iterator End = Insts.end();
// Retrying indicates that we had to roll back to the bundle_lock
// instruction to apply padding before the bundle_lock sequence.
bool Retrying = false;
for (InstList::const_iterator I = Insts.begin(); I != End; ++I) {
if (I->isDeleted() || I->isRedundantAssign())
continue;
if (llvm::isa<InstBundleLock>(I)) {
// Set up the initial bundle_lock state. This should not happen
// while retrying, because the retry rolls back to the
// instruction following the bundle_lock instruction.
assert(!Retrying);
Helper.enterBundleLock(I);
continue;
}
if (llvm::isa<InstBundleUnlock>(I)) {
Helper.enterBundleUnlock();
if (Retrying) {
// Make sure all instructions are in the same bundle.
assert(Helper.isSameBundle());
// If align_to_end is specified, make sure the next
// instruction begins the bundle.
assert(!Helper.isAlignToEnd() || Helper.getPostAlignment() == 0);
Helper.leaveBundleLockRegion();
Retrying = false;
} else {
// This is the first pass, so roll back for the retry pass.
Helper.rollback();
// Pad to the next bundle if the instruction sequence crossed
// a bundle boundary.
Helper.padToNextBundle();
// Insert additional padding to make AlignToEnd work.
Helper.padForAlignToEnd();
// Prepare for the retry pass after padding is done.
Retrying = true;
I = Helper.getBundleLockStart();
}
continue;
}
// I points to a non-bundle_lock/bundle_unlock instruction.
if (Helper.isInBundleLockRegion()) {
I->emitIAS(Func);
// Only update stats during the final pass.
if (Retrying)
updateStats(Func, I);
} else {
// Treat it as though there were an implicit bundle_lock and
// bundle_unlock wrapping the instruction.
Helper.enterBundleLock(I);
I->emitIAS(Func);
Helper.enterBundleUnlock();
Helper.rollback();
Helper.padToNextBundle();
I->emitIAS(Func);
updateStats(Func, I);
Helper.leaveBundleLockRegion();
}
}
// Don't allow bundle locking across basic blocks, to keep the
// backtracking mechanism simple.
assert(!Helper.isInBundleLockRegion());
assert(!Retrying);
}
void CfgNode::dump(Cfg *Func) const {
...
@@ -436,6 +436,13 @@ bool InstSwitch::repointEdge(CfgNode *OldNode, CfgNode *NewNode) {
InstUnreachable::InstUnreachable(Cfg *Func)
: InstHighLevel(Func, Inst::Unreachable, 0, nullptr) {}
InstBundleLock::InstBundleLock(Cfg *Func, InstBundleLock::Option BundleOption)
: InstHighLevel(Func, Inst::BundleLock, 0, nullptr),
BundleOption(BundleOption) {}
InstBundleUnlock::InstBundleUnlock(Cfg *Func)
: InstHighLevel(Func, Inst::BundleUnlock, 0, nullptr) {}
InstFakeDef::InstFakeDef(Cfg *Func, Variable *Dest, Variable *Src)
: InstHighLevel(Func, Inst::FakeDef, Src ? 1 : 0, Dest) {
assert(Dest);
@@ -774,6 +781,48 @@ void InstUnreachable::dump(const Cfg *Func) const {
Str << "unreachable";
}
void InstBundleLock::emit(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Func->getContext()->getStrEmit();
Str << "\t.bundle_lock";
switch (BundleOption) {
case Opt_None:
break;
case Opt_AlignToEnd:
Str << "\talign_to_end";
break;
}
}
void InstBundleLock::dump(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Func->getContext()->getStrDump();
Str << "bundle_lock";
switch (BundleOption) {
case Opt_None:
break;
case Opt_AlignToEnd:
Str << " align_to_end";
break;
}
}
void InstBundleUnlock::emit(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Func->getContext()->getStrEmit();
Str << "\t.bundle_unlock";
}
void InstBundleUnlock::dump(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Func->getContext()->getStrDump();
Str << "bundle_unlock";
}
void InstFakeDef::emit(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
...
@@ -59,11 +59,13 @@ public:
Select,
Store,
Switch,
BundleLock,   // not part of LLVM/PNaCl bitcode
BundleUnlock, // not part of LLVM/PNaCl bitcode
FakeDef,      // not part of LLVM/PNaCl bitcode
FakeUse,      // not part of LLVM/PNaCl bitcode
FakeKill,     // not part of LLVM/PNaCl bitcode
Target        // target-specific low-level ICE
// Anything >= Target is an InstTarget subclass.
};
InstKind getKind() const { return Kind; }
@@ -743,6 +745,53 @@ private:
~InstUnreachable() override {}
};
// BundleLock instruction. There are no operands. Contains an option
// indicating whether align_to_end is specified.
class InstBundleLock : public InstHighLevel {
InstBundleLock(const InstBundleLock &) = delete;
InstBundleLock &operator=(const InstBundleLock &) = delete;
public:
enum Option { Opt_None, Opt_AlignToEnd };
static InstBundleLock *create(Cfg *Func, Option BundleOption) {
return new (Func->allocate<InstBundleLock>())
InstBundleLock(Func, BundleOption);
}
void emit(const Cfg *Func) const override;
void emitIAS(const Cfg * /* Func */) const override {}
void dump(const Cfg *Func) const override;
Option getOption() const { return BundleOption; }
static bool classof(const Inst *Inst) {
return Inst->getKind() == BundleLock;
}
private:
Option BundleOption;
InstBundleLock(Cfg *Func, Option BundleOption);
~InstBundleLock() override {}
};
// BundleUnlock instruction. There are no operands.
class InstBundleUnlock : public InstHighLevel {
InstBundleUnlock(const InstBundleUnlock &) = delete;
InstBundleUnlock &operator=(const InstBundleUnlock &) = delete;
public:
static InstBundleUnlock *create(Cfg *Func) {
return new (Func->allocate<InstBundleUnlock>()) InstBundleUnlock(Func);
}
void emit(const Cfg *Func) const override;
void emitIAS(const Cfg * /* Func */) const override {}
void dump(const Cfg *Func) const override;
static bool classof(const Inst *Inst) {
return Inst->getKind() == BundleUnlock;
}
private:
explicit InstBundleUnlock(Cfg *Func);
~InstBundleUnlock() override {}
};
// FakeDef instruction. This creates a fake definition of a variable,
// which is how we represent the case when an instruction produces
// multiple results. This doesn't happen with high-level ICE
...
@@ -200,6 +200,11 @@ bool InstX8632Br::repointEdge(CfgNode *OldNode, CfgNode *NewNode) {
return false;
}
InstX8632Jmp::InstX8632Jmp(Cfg *Func, Operand *Target)
: InstX8632(Func, InstX8632::Jmp, 1, nullptr) {
addSource(Target);
}
InstX8632Call::InstX8632Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
: InstX8632(Func, InstX8632::Call, 1, Dest) {
HasSideEffects = true;
@@ -450,6 +455,59 @@ void InstX8632Br::dump(const Cfg *Func) const {
}
}
void InstX8632Jmp::emit(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
Str << "\tjmp\t*";
getJmpTarget()->emit(Func);
}
void InstX8632Jmp::emitIAS(const Cfg *Func) const {
// Note: Adapted (mostly copied) from InstX8632Call::emitIAS().
x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
Operand *Target = getJmpTarget();
if (const auto Var = llvm::dyn_cast<Variable>(Target)) {
if (Var->hasReg()) {
Asm->jmp(RegX8632::getEncodedGPR(Var->getRegNum()));
} else {
// The jmp instruction with a memory operand should be possible
// to encode, but it isn't a valid sandboxed instruction, and
// there shouldn't be a register allocation issue to jump
// through a scratch register, so we don't really need to bother
// implementing it.
llvm::report_fatal_error("Assembler can't jmp to memory operand");
}
} else if (const auto Mem = llvm::dyn_cast<OperandX8632Mem>(Target)) {
assert(Mem->getSegmentRegister() == OperandX8632Mem::DefaultSegment);
llvm::report_fatal_error("Assembler can't jmp to memory operand");
} else if (const auto CR = llvm::dyn_cast<ConstantRelocatable>(Target)) {
assert(CR->getOffset() == 0 && "We only support jumping to a function");
Asm->jmp(CR);
} else if (const auto Imm = llvm::dyn_cast<ConstantInteger32>(Target)) {
// NaCl trampoline calls refer to an address within the sandbox directly.
// This is usually only needed for non-IRT builds and otherwise not
// very portable or stable. For this, we would use the 0xE8 opcode
// (relative version of call) and there should be a PC32 reloc too.
// The PC32 reloc will have symbol index 0, and the absolute address
// would be encoded as an offset relative to the next instruction.
// TODO(jvoung): Do we need to support this?
(void)Imm;
llvm::report_fatal_error("Unexpected jmp to absolute address");
} else {
llvm::report_fatal_error("Unexpected operand type");
}
}
void InstX8632Jmp::dump(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Func->getContext()->getStrDump();
Str << "jmp ";
getJmpTarget()->dump(Func);
}
void InstX8632Call::emit(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
@@ -2729,7 +2787,7 @@ void OperandX8632Mem::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
if (SegmentReg != DefaultSegment) {
assert(SegmentReg >= 0 && SegmentReg < SegReg_NUM);
Str << "%" << InstX8632SegmentRegNames[SegmentReg] << ":";
}
// Emit as Offset(Base,Index,1<<Shift).
// Offset is emitted without the leading '$'.
@@ -2737,7 +2795,7 @@ void OperandX8632Mem::emit(const Cfg *Func) const {
if (!Offset) {
// No offset, emit nothing.
} else if (const auto CI = llvm::dyn_cast<ConstantInteger32>(Offset)) {
if (Base == nullptr || CI->getValue())
// Emit a non-zero offset without a leading '$'.
Str << CI->getValue();
} else if (const auto CR = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
...
@@ -196,6 +196,7 @@ public:
Idiv,
Imul,
Insertps,
Jmp,
Label,
Lea,
Load,
@@ -400,6 +401,27 @@ private:
const InstX8632Label *Label; // Intra-block branch target
};
// Jump to a target outside this function, such as tailcall, nacljump,
// naclret, unreachable. This is different from a Branch instruction
// in that there is no intra-function control flow to represent.
class InstX8632Jmp : public InstX8632 {
InstX8632Jmp(const InstX8632Jmp &) = delete;
InstX8632Jmp &operator=(const InstX8632Jmp &) = delete;
public:
static InstX8632Jmp *create(Cfg *Func, Operand *Target) {
return new (Func->allocate<InstX8632Jmp>()) InstX8632Jmp(Func, Target);
}
Operand *getJmpTarget() const { return getSrc(0); }
void emit(const Cfg *Func) const override;
void emitIAS(const Cfg *Func) const override;
void dump(const Cfg *Func) const override;
static bool classof(const Inst *Inst) { return isClassof(Inst, Jmp); }
private:
InstX8632Jmp(Cfg *Func, Operand *Target);
};
// AdjustStack instruction - subtracts esp by the given amount and
// updates the stack offset during code emission.
class InstX8632AdjustStack : public InstX8632 {
...
@@ -218,6 +218,8 @@ void TargetLowering::lower() {
case Inst::Unreachable:
lowerUnreachable(llvm::dyn_cast<InstUnreachable>(Inst));
break;
case Inst::BundleLock:
case Inst::BundleUnlock:
case Inst::FakeDef:
case Inst::FakeUse:
case Inst::FakeKill:
...
@@ -963,6 +963,29 @@ void TargetX8632::addEpilog(CfgNode *Node) {
_pop(getPhysicalRegister(j));
}
} }
if (!Ctx->getFlags().getUseSandboxing())
return;
// Change the original ret instruction into a sandboxed return sequence.
// t:ecx = pop
// bundle_lock
// and t, ~31
// jmp *t
// bundle_unlock
// FakeUse <original_ret_operand>
const SizeT BundleSize =
1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
_pop(T_ecx);
_bundle_lock();
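// Mask the popped return address down to a bundle boundary; with
// 32-byte bundles this emits "and ecx,0xffffffe0".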
_and(T_ecx, Ctx->getConstantInt32(~(BundleSize - 1)));
_jmp(T_ecx);
_bundle_unlock();
if (RI->getSrcSize()) {
Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
Context.insert(InstFakeUse::create(Func, RetValue));
}
RI->setDeleted();
}
void TargetX8632::split64(Variable *Var) {
@@ -1815,8 +1838,24 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
}
}
Operand *CallTarget = legalize(Instr->getCallTarget());
const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
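// When sandboxing, the call is placed in an align_to_end bundle_lock
// region so that the call instruction ends exactly at a bundle
// boundary, leaving the return address bundle-aligned.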
if (NeedSandboxing) {
if (llvm::isa<Constant>(CallTarget)) {
_bundle_lock(InstBundleLock::Opt_AlignToEnd);
} else {
Variable *CallTargetVar = nullptr;
_mov(CallTargetVar, CallTarget);
_bundle_lock(InstBundleLock::Opt_AlignToEnd);
const SizeT BundleSize =
1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
_and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
CallTarget = CallTargetVar;
}
}
Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget);
Context.insert(NewCall);
if (NeedSandboxing)
_bundle_unlock();
if (ReturnRegHi)
Context.insert(InstFakeDef::create(Func, ReturnRegHi));
@@ -3829,6 +3868,9 @@ void TargetX8632::lowerRet(const InstRet *Inst) {
_mov(Reg, Src0, RegX8632::Reg_eax);
}
}
// Add a ret instruction even if sandboxing is enabled, because
// addEpilog explicitly looks for a ret instruction as a marker for
// where to insert the frame removal instructions.
_ret(Reg);
// Add a fake use of esp to make sure esp stays alive for the entire
// function. Otherwise post-call esp adjustments get dead-code
...
@@ -230,6 +230,11 @@ protected:
void _bswap(Variable *SrcDest) {
Context.insert(InstX8632Bswap::create(Func, SrcDest));
}
void
_bundle_lock(InstBundleLock::Option BundleOption = InstBundleLock::Opt_None) {
Context.insert(InstBundleLock::create(Func, BundleOption));
}
void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); }
void _cbwdq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
}
@@ -286,6 +291,9 @@ protected:
void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Insertps::create(Func, Dest, Src0, Src1));
}
void _jmp(Operand *Target) {
Context.insert(InstX8632Jmp::create(Func, Target));
}
void _lea(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Lea::create(Func, Dest, Src0));
}
...
@@ -40,7 +40,8 @@ AssemblerFixup *AssemblerBuffer::createFixup(FixupKind Kind,
F->set_position(0);
F->set_kind(Kind);
F->set_value(Value);
if (!assembler_.getPreliminary())
fixups_.push_back(F);
return F;
}
...
@@ -115,6 +115,11 @@ public:
const FixupRefList &fixups() const { return fixups_; }
void setSize(intptr_t NewSize) {
assert(NewSize <= Size());
cursor_ = contents_ + NewSize;
}
private:
// The limit is set to kMinimumGap bytes before the end of the data area.
// This leaves enough space for the longest possible instruction and allows
@@ -149,7 +154,9 @@ class Assembler {
Assembler &operator=(const Assembler &) = delete;
public:
Assembler()
: FunctionName(""), IsInternal(false), Preliminary(false),
buffer_(*this) {}
virtual ~Assembler() {}
// Allocate a chunk of bytes using the per-Assembler allocator.
@@ -170,6 +177,9 @@ public:
// Align the tail end of the function to the required target alignment.
virtual void alignFunction() = 0;
// Add nop padding of a particular width to the current bundle.
virtual void padWithNop(intptr_t Padding) = 0;
virtual SizeT getBundleAlignLog2Bytes() const = 0;
virtual llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const = 0;
@@ -194,6 +204,11 @@ public:
void setInternal(bool Internal) { IsInternal = Internal; }
const IceString &getFunctionName() { return FunctionName; }
void setFunctionName(const IceString &NewName) { FunctionName = NewName; }
intptr_t getBufferSize() const { return buffer_.Size(); }
// Roll back to a (smaller) size.
void setBufferSize(intptr_t NewSize) { buffer_.setSize(NewSize); }
void setPreliminary(bool Value) { Preliminary = Value; }
bool getPreliminary() const { return Preliminary; }
private:
ArenaAllocator<32 * 1024> Allocator;
@@ -202,6 +217,11 @@ private:
// assembler buffer is emitted.
IceString FunctionName;
bool IsInternal;
// Preliminary indicates whether a preliminary pass is being made
// for calculating bundle padding (Preliminary=true), versus the
// final pass where all changes to label bindings, label links, and
// relocation fixups are fully committed (Preliminary=false).
bool Preliminary;
protected:
AssemblerBuffer buffer_;
...
@@ -79,13 +79,15 @@ Label *AssemblerX86::GetOrCreateLocalLabel(SizeT Number) {
}
void AssemblerX86::BindCfgNodeLabel(SizeT NodeNumber) {
assert(!getPreliminary());
Label *L = GetOrCreateCfgNodeLabel(NodeNumber);
this->Bind(L);
}
void AssemblerX86::BindLocalLabel(SizeT Number) {
Label *L = GetOrCreateLocalLabel(Number);
if (!getPreliminary())
this->Bind(L);
}
void AssemblerX86::call(GPRRegister reg) {
@@ -2229,6 +2231,13 @@ void AssemblerX86::j(CondX86::BrCond condition, Label *label, bool near) {
intptr_t offset = label->Position() - buffer_.Size();
assert(offset <= 0);
if (Utils::IsInt(8, offset - kShortSize)) {
// TODO(stichnot): Here and in jmp(), we may need to be more
// conservative about the backward branch distance if the branch
// instruction is within a bundle_lock sequence, because the
// distance may increase when padding is added. This isn't an
// issue for branches outside a bundle_lock, because if padding
// is added, the retry may change it to a long backward branch
// without affecting any of the bookkeeping.
EmitUint8(0x70 + condition);
EmitUint8((offset - kShortSize) & 0xFF);
} else {
@@ -2463,14 +2472,16 @@ void AssemblerX86::EmitLabelLink(Label *label) {
assert(!label->IsBound());
intptr_t position = buffer_.Size();
EmitInt32(label->position_);
if (!getPreliminary())
label->LinkTo(position);
}
void AssemblerX86::EmitNearLabelLink(Label *label) {
assert(!label->IsBound());
intptr_t position = buffer_.Size();
EmitUint8(0);
if (!getPreliminary())
label->NearLinkTo(position);
}
void AssemblerX86::EmitGenericShift(int rm, Type Ty, GPRRegister reg,
...
@@ -325,6 +325,8 @@ private:
intptr_t position_;
intptr_t num_unresolved_;
// TODO(stichnot,jvoung): Can this instead be
// llvm::SmallVector<intptr_t, kMaxUnresolvedBranches> ?
intptr_t unresolved_near_positions_[kMaxUnresolvedBranches];
friend class AssemblerX86;
@@ -354,6 +356,15 @@ public:
return llvm::ArrayRef<uint8_t>(Padding, 1);
}
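// Emit Padding bytes of nop instructions, using nops of at most
// MAX_NOP_SIZE bytes each so that large amounts of padding decode as
// only a few instructions.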
void padWithNop(intptr_t Padding) override {
while (Padding > MAX_NOP_SIZE) {
nop(MAX_NOP_SIZE);
Padding -= MAX_NOP_SIZE;
}
if (Padding)
nop(Padding);
}
Label *GetOrCreateCfgNodeLabel(SizeT NodeNumber);
void BindCfgNodeLabel(SizeT NodeNumber) override;
Label *GetOrCreateLocalLabel(SizeT Number);
...
; Tests basics and corner cases of x86-32 sandboxing, using -Om1 in
; the hope that the output will remain stable. When packing bundles,
; we try to limit to a few instructions with well known sizes and
; minimal use of registers and stack slots in the lowering sequence.
; RUN: %p2i -i %s --assemble --disassemble --args -Om1 --verbose none \
; RUN: -ffunction-sections -sandbox | FileCheck %s
declare void @call_target()
@global_byte = internal global [1 x i8] zeroinitializer
@global_short = internal global [2 x i8] zeroinitializer
; A direct call sequence gets the right padding so that the call ends
; precisely at a bundle boundary.
define void @test_direct_call() {
entry:
call void @call_target()
ret void
}
; CHECK-LABEL: test_direct_call
; CHECK: nop
; CHECK: 1b: {{.*}} call 1c
; CHECK-NEXT: 20:
; An indirect call sequence uses the right mask and register-call sequence.
define void @test_indirect_call(i32 %target) {
entry:
%__1 = inttoptr i32 %target to void ()*
call void %__1()
ret void
}
; CHECK-LABEL: test_indirect_call
; CHECK: mov [[REG:.*]],DWORD PTR [esp
; CHECK-NEXT: nop
; CHECK: 1b: {{.*}} and [[REG]],0xffffffe0
; CHECK-NEXT: call [[REG]]
; CHECK-NEXT: 20:
; A return sequence uses the right pop / mask / jmp sequence.
define void @test_ret() {
entry:
ret void
}
; CHECK-LABEL: test_ret
; CHECK: pop ecx
; CHECK-NEXT: and ecx,0xffffffe0
; CHECK-NEXT: jmp ecx
; A perfectly packed bundle should not have nops at the end.
define void @packed_bundle() {
entry:
call void @call_target()
; bundle boundary
%addr_byte = bitcast [1 x i8]* @global_byte to i8*
%addr_short = bitcast [2 x i8]* @global_short to i16*
store i8 0, i8* %addr_byte, align 1 ; 7-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
store i8 0, i8* %addr_byte, align 1 ; 7-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
; bundle boundary
store i8 0, i8* %addr_byte, align 1 ; 7-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
ret void
}
; CHECK-LABEL: packed_bundle
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov BYTE PTR
; CHECK-NEXT: 27: {{.*}} mov WORD PTR
; CHECK-NEXT: 30: {{.*}} mov BYTE PTR
; CHECK-NEXT: 37: {{.*}} mov WORD PTR
; CHECK-NEXT: 40: {{.*}} mov BYTE PTR
; CHECK-NEXT: 47: {{.*}} mov WORD PTR
; An imperfectly packed bundle should have one or more nops at the end.
define void @nonpacked_bundle() {
entry:
call void @call_target()
; bundle boundary
%addr_short = bitcast [2 x i8]* @global_short to i16*
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
; nop padding
; bundle boundary
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
ret void
}
; CHECK-LABEL: nonpacked_bundle
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov WORD PTR
; CHECK-NEXT: 29: {{.*}} mov WORD PTR
; CHECK-NEXT: 32: {{.*}} mov WORD PTR
; CHECK-NEXT: 3b: {{.*}} nop
; CHECK: 40: {{.*}} mov WORD PTR
; A zero-byte instruction (e.g. local label definition) at a bundle
; boundary should not trigger nop padding.
define void @label_at_boundary(i32 %arg) {
entry:
call void @call_target()
; bundle boundary
%addr_short = bitcast [2 x i8]* @global_short to i16*
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
%cmp = icmp eq i32 %arg, 0 ; 23-byte lowering sequence
; label is here
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
ret void
}
; CHECK-LABEL: label_at_boundary
; CHECK: call
; We rely on the hideous 4-instruction 23-byte Om1 lowering sequence for icmp.
; CHECK-NEXT: 20: {{.*}} mov WORD PTR
; CHECK-NEXT: 29: {{.*}} cmp DWORD PTR
; CHECK-NEXT: 2e: {{.*}} mov DWORD PTR
; CHECK-NEXT: 36: {{.*}} je 40
; CHECK-NEXT: 38: {{.*}} mov DWORD PTR
; CHECK-NEXT: 40: {{.*}} mov WORD PTR
; Bundle lock without padding.
define void @bundle_lock_without_padding() {
entry:
%addr_short = bitcast [2 x i8]* @global_short to i16*
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
ret void
}
; CHECK-LABEL: bundle_lock_without_padding
; CHECK: mov WORD PTR
; CHECK-NEXT: pop ecx
; CHECK-NEXT: and ecx,0xffffffe0
; CHECK-NEXT: jmp ecx
; Bundle lock with padding.
define void @bundle_lock_with_padding() {
entry:
call void @call_target()
; bundle boundary
%addr_byte = bitcast [1 x i8]* @global_byte to i8*
%addr_short = bitcast [2 x i8]* @global_short to i16*
store i8 0, i8* %addr_byte, align 1 ; 7-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
ret void
; 3 bytes to restore stack pointer
; 1 byte to pop ecx
; bundle_lock
; 3 bytes to mask ecx
; The mask instruction would end exactly 32 bytes past the start of
; the bundle, so the 2-byte jmp would cross a bundle boundary and a
; 3-byte nop has to be emitted before the bundle_lock sequence.
; 2 bytes to jump to ecx
; bundle_unlock
}
; CHECK-LABEL: bundle_lock_with_padding
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov BYTE PTR
; CHECK-NEXT: 27: {{.*}} mov WORD PTR
; CHECK-NEXT: 30: {{.*}} mov WORD PTR
; CHECK-NEXT: 39: {{.*}} add esp,
; CHECK-NEXT: 3c: {{.*}} pop ecx
; CHECK-NEXT: 3d: {{.*}} nop
; CHECK-NEXT: 40: {{.*}} and ecx,0xffffffe0
; CHECK-NEXT: 43: {{.*}} jmp ecx
; Bundle lock align_to_end without any padding.
define void @bundle_lock_align_to_end_padding_0() {
entry:
call void @call_target()
; bundle boundary
%addr_short = bitcast [2 x i8]* @global_short to i16*
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
call void @call_target() ; 5-byte instruction
ret void
}
; CHECK-LABEL: bundle_lock_align_to_end_padding_0
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov WORD PTR
; CHECK-NEXT: 29: {{.*}} mov WORD PTR
; CHECK-NEXT: 32: {{.*}} mov WORD PTR
; CHECK-NEXT: 3b: {{.*}} call
; Bundle lock align_to_end with one bunch of padding.
define void @bundle_lock_align_to_end_padding_1() {
entry:
call void @call_target()
; bundle boundary
%addr_byte = bitcast [1 x i8]* @global_byte to i8*
store i8 0, i8* %addr_byte, align 1 ; 7-byte instruction
store i8 0, i8* %addr_byte, align 1 ; 7-byte instruction
store i8 0, i8* %addr_byte, align 1 ; 7-byte instruction
call void @call_target() ; 5-byte instruction
ret void
}
; CHECK-LABEL: bundle_lock_align_to_end_padding_1
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov BYTE PTR
; CHECK-NEXT: 27: {{.*}} mov BYTE PTR
; CHECK-NEXT: 2e: {{.*}} mov BYTE PTR
; CHECK-NEXT: 35: {{.*}} nop
; CHECK: 3b: {{.*}} call
; Bundle lock align_to_end with two bunches of padding.
define void @bundle_lock_align_to_end_padding_2(i32 %target) {
entry:
call void @call_target()
; bundle boundary
%addr_byte = bitcast [1 x i8]* @global_byte to i8*
%addr_short = bitcast [2 x i8]* @global_short to i16*
%__1 = inttoptr i32 %target to void ()*
store i8 0, i8* %addr_byte, align 1 ; 7-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
store i16 0, i16* %addr_short, align 1 ; 9-byte instruction
call void %__1()
; 4 bytes to load %target into a register
; bundle_lock align_to_end
; 3 bytes to mask the register
; The mask instruction would end exactly 32 bytes past the start of
; the bundle, so a 3-byte nop has to be emitted before the bundle_lock
; sequence, followed by a 27-byte nop before the mask/jump so that
; the call ends precisely at a bundle boundary.
; 2 bytes to jump to the register
; bundle_unlock
ret void
}
; CHECK-LABEL: bundle_lock_align_to_end_padding_2
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov BYTE PTR
; CHECK-NEXT: 27: {{.*}} mov WORD PTR
; CHECK-NEXT: 30: {{.*}} mov WORD PTR
; CHECK-NEXT: 39: {{.*}} mov [[REG:.*]],DWORD PTR [esp
; CHECK-NEXT: 3d: {{.*}} nop
; CHECK: 40: {{.*}} nop
; CHECK: 5b: {{.*}} and [[REG]],0xffffffe0
; CHECK-NEXT: 5e: {{.*}} call [[REG]]