Combine allocas

Partition allocas that occur in the entry block into two categories. The first is those whose size is fixed and whose alignment is less than or equal to the stack alignment. These are emitted relative to a pointer, either in increasing offset relative to the stack pointer or decreasing offset relative to the frame pointer. (Actually, we are not enabling this optimization for frame pointer frames yet.) The second category is allocas whose size is dynamic or whose alignment is greater than the stack alignment. These are emitted relative to a user variable in increasing offset order. This optimization is only enabled for x86 at O2. BUG= R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1411583007 .

Combine allocas
4318a410 · David Sehr · 5ff0cfb4 · 4318a410 · 4318a410 · 4318a410
Commit 4318a410 authored Nov 11, 2015 by David Sehr
16 changed files
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -185,10 +185,19 @@ public:
  void advancedPhiLowering();
  void reorderNodes();
  void shuffleNodes();
-  void sortAllocas(CfgVector<Inst *> &Allocas, InstList &Insts,
-                   bool IsKnownFrameOffset);
-  /// Merge all the fixed-size allocas in the entry block.
-  void processAllocas();
+
+  enum AllocaBaseVariableType {
+    BVT_StackPointer,
+    BVT_FramePointer,
+    BVT_UserPointer
+  };
+  void sortAndCombineAllocas(CfgVector<Inst *> &Allocas,
+                             uint32_t CombinedAlignment, InstList &Insts,
+                             AllocaBaseVariableType BaseVariableType);
+  /// Scan allocas to determine whether we need to use a frame pointer.
+  /// If SortAndCombine == true, merge all the fixed-size allocas in the
+  /// entry block and emit stack or frame pointer-relative addressing.
+  void processAllocas(bool SortAndCombine);
  void doAddressOpt();
  void doArgLowering();
  void doNopInsertion();

--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -101,6 +101,20 @@ MachineTraits<TargetX8632>::X86OperandMem::X86OperandMem(
 void MachineTraits<TargetX8632>::X86OperandMem::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;
+  const ::Ice::TargetLowering *Target = Func->getTarget();
+  // If the base is rematerializable, we need to replace it with the correct
+  // physical register (esp or ebp), and update the Offset.
+  int32_t Disp = 0;
+  if (getBase() && getBase()->isRematerializable()) {
+    Disp += getBase()->getStackOffset();
+    if (!getIgnoreStackAdjust())
+      Disp += Target->getStackAdjustment();
+  }
+  // The index should never be rematerializable.  But if we ever allow it, then
+  // we should make sure the rematerialization offset is shifted by the Shift
+  // value.
+  if (getIndex())
+    assert(!getIndex()->isRematerializable());
  Ostream &Str = Func->getContext()->getStrEmit();
  if (SegmentReg != DefaultSegment) {
    assert(SegmentReg >= 0 && SegmentReg < SegReg_NUM);
@@ -108,27 +122,33 @@ void MachineTraits<TargetX8632>::X86OperandMem::emit(const Cfg *Func) const {
  }
  // Emit as Offset(Base,Index,1<<Shift). Offset is emitted without the leading
  // '$'. Omit the (Base,Index,1<<Shift) part if Base==nullptr.
-  if (!Offset) {
+  if (getOffset() == 0 && Disp == 0) {
    // No offset, emit nothing.
-  } else if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(Offset)) {
-    if (Base == nullptr || CI->getValue())
+  } else if (getOffset() == 0 && Disp != 0) {
+    Str << Disp;
+  } else if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(getOffset())) {
+    if (getBase() == nullptr || CI->getValue() || Disp != 0)
      // Emit a non-zero offset without a leading '$'.
-      Str << CI->getValue();
-  } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
+      Str << CI->getValue() + Disp;
+  } else if (const auto *CR =
+                 llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
+    // TODO(sehr): ConstantRelocatable still needs updating for
+    // rematerializable base/index and Disp.
+    assert(Disp == 0);
    CR->emitWithoutPrefix(Func->getTarget());
  } else {
    llvm_unreachable("Invalid offset type for x86 mem operand");
  }

-  if (Base || Index) {
+  if (getBase() || getIndex()) {
    Str << "(";
-    if (Base)
-      Base->emit(Func);
-    if (Index) {
+    if (getBase())
+      getBase()->emit(Func);
+    if (getIndex()) {
      Str << ",";
-      Index->emit(Func);
-      if (Shift)
-        Str << "," << (1u << Shift);
+      getIndex()->emit(Func);
+      if (getShift())
+        Str << "," << (1u << getShift());
    }
    Str << ")";
  }
@@ -144,44 +164,54 @@ void MachineTraits<TargetX8632>::X86OperandMem::dump(const Cfg *Func,
  }
  bool Dumped = false;
  Str << "[";
-  if (Base) {
+  int32_t Disp = 0;
+  if (getBase() && getBase()->isRematerializable()) {
+    Disp += getBase()->getStackOffset();
+    if (!getIgnoreStackAdjust())
+      Disp += Func->getTarget()->getStackAdjustment();
+  }
+  if (getBase()) {
    if (Func)
-      Base->dump(Func);
+      getBase()->dump(Func);
    else
-      Base->dump(Str);
+      getBase()->dump(Str);
    Dumped = true;
  }
-  if (Index) {
-    if (Base)
+  if (getIndex()) {
+    assert(!getIndex()->isRematerializable());
+    if (getBase())
      Str << "+";
-    if (Shift > 0)
-      Str << (1u << Shift) << "*";
+    if (getShift() > 0)
+      Str << (1u << getShift()) << "*";
    if (Func)
-      Index->dump(Func);
+      getIndex()->dump(Func);
    else
-      Index->dump(Str);
+      getIndex()->dump(Str);
    Dumped = true;
  }
  // Pretty-print the Offset.
  bool OffsetIsZero = false;
  bool OffsetIsNegative = false;
-  if (!Offset) {
+  if (getOffset() == 0 && Disp == 0) {
    OffsetIsZero = true;
-  } else if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(Offset)) {
-    OffsetIsZero = (CI->getValue() == 0);
-    OffsetIsNegative = (static_cast<int32_t>(CI->getValue()) < 0);
+  } else if (getOffset() == 0 && Disp != 0) {
+    OffsetIsZero = (Disp == 0);
+    OffsetIsNegative = (Disp < 0);
+  } else if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(getOffset())) {
+    OffsetIsZero = (CI->getValue() + Disp == 0);
+    OffsetIsNegative = (static_cast<int32_t>(CI->getValue()) + Disp < 0);
  } else {
-    assert(llvm::isa<ConstantRelocatable>(Offset));
+    assert(llvm::isa<ConstantRelocatable>(getOffset()) && Disp == 0);
  }
  if (Dumped) {
    if (!OffsetIsZero) {     // Suppress if Offset is known to be 0
      if (!OffsetIsNegative) // Suppress if Offset is known to be negative
        Str << "+";
-      Offset->dump(Func, Str);
+      getOffset()->dump(Func, Str);
    }
  } else {
    // There is only the offset.
-    Offset->dump(Func, Str);
+    getOffset()->dump(Func, Str);
  }
  Str << "]";
 }
@@ -196,16 +226,28 @@ void MachineTraits<TargetX8632>::X86OperandMem::emitSegmentOverride(

 MachineTraits<TargetX8632>::Address
 MachineTraits<TargetX8632>::X86OperandMem::toAsmAddress(
-    MachineTraits<TargetX8632>::Assembler *Asm) const {
+    MachineTraits<TargetX8632>::Assembler *Asm,
+    const Ice::TargetLowering *Target) const {
  int32_t Disp = 0;
+  if (getBase() && getBase()->isRematerializable()) {
+    Disp += getBase()->getStackOffset();
+    if (!getIgnoreStackAdjust()) {
+      Disp += Target->getStackAdjustment();
+    }
+  }
+  // The index should never be rematerializable.  But if we ever allow it, then
+  // we should make sure the rematerialization offset is shifted by the Shift
+  // value.
+  if (getIndex())
+    assert(!getIndex()->isRematerializable());
  AssemblerFixup *Fixup = nullptr;
  // Determine the offset (is it relocatable?)
  if (getOffset()) {
    if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(getOffset())) {
-      Disp = static_cast<int32_t>(CI->getValue());
+      Disp += static_cast<int32_t>(CI->getValue());
    } else if (const auto CR =
                   llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
-      Disp = CR->getOffset();
+      Disp += CR->getOffset();
      Fixup = Asm->createFixup(RelFixup, CR);
    } else {
      llvm_unreachable("Unexpected offset type");

--- a/src/IceInstX8664.cpp
+++ b/src/IceInstX8664.cpp
@@ -170,7 +170,14 @@ void MachineTraits<TargetX8664>::X86OperandMem::dump(const Cfg *Func,

 MachineTraits<TargetX8664>::Address
 MachineTraits<TargetX8664>::X86OperandMem::toAsmAddress(
-    MachineTraits<TargetX8664>::Assembler *Asm) const {
+    MachineTraits<TargetX8664>::Assembler *Asm,
+    const Ice::TargetLowering *Target) const {
+  // TODO(sehr): handle rematerializable base/index.
+  (void)Target;
+  if (getBase())
+    assert(!getBase()->isRematerializable());
+  if (getIndex())
+    assert(!getIndex()->isRematerializable());
  int32_t Disp = 0;
  AssemblerFixup *Fixup = nullptr;
  // Determine the offset (is it relocatable?)

--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -509,6 +509,13 @@ public:
  bool mustNotHaveReg() const {
    return RegRequirement == RR_MustNotHaveRegister;
  }
+  void setRematerializable(int32_t NewRegNum, int32_t NewOffset) {
+    IsRematerializable = true;
+    setRegNum(NewRegNum);
+    setStackOffset(NewOffset);
+    setMustHaveReg();
+  }
+  bool isRematerializable() const { return IsRematerializable; }

  void setRegClass(uint8_t RC) { RegisterClass = static_cast<RegClass>(RC); }
  RegClass getRegClass() const { return RegisterClass; }
@@ -573,6 +580,9 @@ protected:
  /// and validating live ranges. This is usually reserved for the stack
  /// pointer and other physical registers specifically referenced by name.
  bool IgnoreLiveness = false;
+  // If IsRematerializable, RegNum keeps track of which register (stack or frame
+  // pointer), and StackOffset is the known offset from that register.
+  bool IsRematerializable = false;
  RegRequirement RegRequirement = RR_MayHaveRegister;
  RegClass RegisterClass;
  /// RegNum is the allocated register, or NoRegister if it isn't

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -239,6 +239,11 @@ void TargetARM32::translateO2() {
  // TODO(stichnot): share passes with X86?
  // https://code.google.com/p/nativeclient/issues/detail?id=4094

+  // Do not merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = false;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
  if (!Ctx->getFlags().getPhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
@@ -340,6 +345,11 @@ void TargetARM32::translateOm1() {

  // TODO: share passes with X86?

+  // Do not merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = false;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
  Func->placePhiLoads();
  if (Func->hasError())
    return;

--- a/src/IceTargetLoweringMIPS32.cpp
+++ b/src/IceTargetLoweringMIPS32.cpp
@@ -92,6 +92,11 @@ void TargetMIPS32::translateO2() {
  // TODO(stichnot): share passes with X86?
  // https://code.google.com/p/nativeclient/issues/detail?id=4094

+  // Merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = true;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
  if (!Ctx->getFlags().getPhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
@@ -187,6 +192,11 @@ void TargetMIPS32::translateOm1() {

  // TODO: share passes with X86?

+  // Do not merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = false;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
  Func->placePhiLoads();
  if (Func->hasError())
    return;

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -151,8 +151,10 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
      Variable *esp =
          Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
-      StackArgLocations.push_back(
-          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
+      auto *Mem = Traits::X86OperandMem::create(Func, Ty, esp, Loc);
+      // Stack stores for arguments are fixed to esp.
+      Mem->setIgnoreStackAdjust(true);
+      StackArgLocations.push_back(Mem);
      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
    }
  }

--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -735,7 +735,8 @@ template <> struct MachineTraits<TargetX8632> {
    uint16_t getShift() const { return Shift; }
    SegmentRegisters getSegmentRegister() const { return SegmentReg; }
    void emitSegmentOverride(Assembler *Asm) const;
-    Address toAsmAddress(Assembler *Asm) const;
+    Address toAsmAddress(Assembler *Asm,
+                         const Ice::TargetLowering *Target) const;

    void emit(const Cfg *Func) const override;
    using X86Operand::dump;
@@ -749,6 +750,9 @@ template <> struct MachineTraits<TargetX8632> {

    bool getRandomized() const { return Randomized; }

+    void setIgnoreStackAdjust(bool Ignore) { IgnoreStackAdjust = Ignore; }
+    bool getIgnoreStackAdjust() const { return IgnoreStackAdjust; }
+
  private:
    X86OperandMem(Cfg *Func, Type Ty, Variable *Base, Constant *Offset,
                  Variable *Index, uint16_t Shift, SegmentRegisters SegmentReg);
@@ -762,6 +766,11 @@ template <> struct MachineTraits<TargetX8632> {
    /// memory operands are generated in
    /// TargetX86Base::randomizeOrPoolImmediate()
    bool Randomized;
+    /// Memory operations involving the stack pointer need to know when the
+    /// stack pointer was moved temporarily.  Ignore that adjustment in
+    /// cases that should be pinned to the stack pointer, such as outgoing
+    /// arguments to calls.
+    bool IgnoreStackAdjust = false;
  };

  /// VariableSplit is a way to treat an f64 memory location as a pair of i32

--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -717,7 +717,8 @@ template <> struct MachineTraits<TargetX8664> {
    uint16_t getShift() const { return Shift; }
    SegmentRegisters getSegmentRegister() const { return DefaultSegment; }
    void emitSegmentOverride(Assembler *) const {}
-    Address toAsmAddress(Assembler *Asm) const;
+    Address toAsmAddress(Assembler *Asm,
+                         const Ice::TargetLowering *Target) const;

    void emit(const Cfg *Func) const override;
    using X86Operand::dump;
@@ -731,6 +732,9 @@ template <> struct MachineTraits<TargetX8664> {

    bool getRandomized() const { return Randomized; }

+    void setIgnoreStackAdjust(bool Ignore) { IgnoreStackAdjust = Ignore; }
+    bool getIgnoreStackAdjust() const { return IgnoreStackAdjust; }
+
  private:
    X86OperandMem(Cfg *Func, Type Ty, Variable *Base, Constant *Offset,
                  Variable *Index, uint16_t Shift);
@@ -743,6 +747,11 @@ template <> struct MachineTraits<TargetX8664> {
    /// memory operands are generated in
    /// TargetX86Base::randomizeOrPoolImmediate()
    bool Randomized = false;
+    /// Memory operations involving the stack pointer need to know when the
+    /// stack pointer was moved temporarily.  Ignore that adjustment in
+    /// cases that should be pinned to the stack pointer, such as outgoing
+    /// arguments to calls.
+    bool IgnoreStackAdjust = false;
  };

  /// VariableSplit is a way to treat an f64 memory location as a pair of i32

--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -237,7 +237,8 @@ protected:
    Legal_Reg = 1 << 0, // physical register, not stack location
    Legal_Imm = 1 << 1,
    Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
-    Legal_All = ~Legal_None
+    Legal_Rematerializable = 1 << 3,
+    Legal_All = ~Legal_Rematerializable
  };
  using LegalMask = uint32_t;
  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -302,6 +302,11 @@ template <class Machine> void TargetX86Base<Machine>::staticInit() {
 template <class Machine> void TargetX86Base<Machine>::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

+  // Merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = true;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
  if (!Ctx->getFlags().getPhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
@@ -420,6 +425,11 @@ template <class Machine> void TargetX86Base<Machine>::translateO2() {
 template <class Machine> void TargetX86Base<Machine>::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

+  // Do not merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = false;
+  Func->processAllocas(SortAndCombineAllocas);
+  Func->dump("After Alloca processing");
+
  Func->placePhiLoads();
  if (Func->hasError())
    return;
@@ -945,7 +955,7 @@ TargetX86Base<Machine>::getRegisterSet(RegSetMask Include,
 template <class Machine>
 void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
  if (!Inst->getKnownFrameOffset())
-    IsEbpBasedFrame = true;
+    setHasFramePointer();
  // Conservatively require the stack to be aligned. Some stack adjustment
  // operations implemented below assume that the stack is aligned before the
  // alloca. All the alloca code ensures that the stack alignment is preserved
@@ -969,6 +979,7 @@ void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
  uint32_t Alignment =
      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
  if (Alignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
+    setHasFramePointer();
    _and(esp, Ctx->getConstantInt32(-Alignment));
  }
  if (const auto *ConstantTotalSize =
@@ -5500,10 +5511,12 @@ Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed,
    Variable *RegBase = nullptr;
    Variable *RegIndex = nullptr;
    if (Base) {
-      RegBase = legalizeToReg(Base);
+      RegBase = llvm::cast<Variable>(
+          legalize(Base, Legal_Reg | Legal_Rematerializable));
    }
    if (Index) {
-      RegIndex = legalizeToReg(Index);
+      RegIndex = llvm::cast<Variable>(
+          legalize(Index, Legal_Reg | Legal_Rematerializable));
    }
    if (Base != RegBase || Index != RegIndex) {
      Mem = Traits::X86OperandMem::create(Func, Ty, RegBase, Mem->getOffset(),
@@ -5575,12 +5588,25 @@ Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed,
    // either when the variable is pre-colored or when it is assigned infinite
    // weight.
    bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
+    bool MustRematerialize =
+        (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
    // We need a new physical register for the operand if:
-    //   Mem is not allowed and Var isn't guaranteed a physical
-    //   register, or
-    //   RegNum is required and Var->getRegNum() doesn't match.
-    if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
-        (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) {
+    // - Mem is not allowed and Var isn't guaranteed a physical register, or
+    // - RegNum is required and Var->getRegNum() doesn't match, or
+    // - Var is a rematerializable variable and rematerializable pass-through is
+    //   not allowed (in which case we need an lea instruction).
+    if (MustRematerialize) {
+      assert(Ty == IceType_i32);
+      Variable *NewVar = makeReg(Ty, RegNum);
+      // Since Var is rematerializable, the offset will be added when the lea is
+      // emitted.
+      constexpr Constant *NoOffset = nullptr;
+      auto *Mem = Traits::X86OperandMem::create(Func, Ty, Var, NoOffset);
+      _lea(NewVar, Mem);
+      From = NewVar;
+    } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
+               (RegNum != Variable::NoRegister && RegNum != Var->getRegNum()) ||
+               MustRematerialize) {
      From = copyToReg(From, RegNum);
    }
    return From;

--- a/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
+; This is a basic test of the alloca instruction and a call.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 -allow-externally-defined-symbols \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+declare void @copy(i32 %arg1, i8* %arr1, i8* %arr2, i8* %arr3, i8* %arr4);
+
+; Test that alloca base addresses get passed correctly to functions.
+define internal void @caller1(i32 %arg) {
+entry:
+  %a1 = alloca i8, i32 32, align 4
+  %p1 = bitcast i8* %a1 to i32*
+  store i32 %arg, i32* %p1, align 1
+  call void @copy(i32 %arg, i8* %a1, i8* %a1, i8* %a1, i8* %a1)
+  ret void
+}
+
+; CHECK-LABEL:  caller1
+; CHECK-NEXT:   sub    esp,0xc
+; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x10]
+; CHECK-NEXT:   sub    esp,0x20
+; CHECK-NEXT:   mov    ecx,esp
+; CHECK-NEXT:   mov    DWORD PTR [esp],eax
+; CHECK-NEXT:   sub    esp,0x20
+; CHECK-NEXT:   mov    DWORD PTR [esp],eax
+; CHECK-NEXT:   lea    eax,[esp+0x20]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x4],eax
+; CHECK-NEXT:   lea    eax,[esp+0x20]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x8],eax
+; CHECK-NEXT:   lea    eax,[esp+0x20]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0xc],eax
+; CHECK-NEXT:   lea    eax,[esp+0x20]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x10],eax
+; CHECK-NEXT:   call
+; CHECK-NEXT:   add    esp,0x20
+; CHECK-NEXT:   add    esp,0x2c
+; CHECK-NEXT:   ret
+
+; Test that alloca base addresses get passed correctly to functions.
+define internal void @caller2(i32 %arg) {
+entry:
+  %a1 = alloca i8, i32 32, align 4
+  %a2 = alloca i8, i32 32, align 4
+  %p1 = bitcast i8* %a1 to i32*
+  %p2 = bitcast i8* %a2 to i32*
+  store i32 %arg, i32* %p1, align 1
+  store i32 %arg, i32* %p2, align 1
+  call void @copy(i32 %arg, i8* %a1, i8* %a2, i8* %a1, i8* %a2)
+  ret void
+}
+
+; CHECK-LABEL:  caller2
+; CHECK-NEXT:   sub    esp,0xc
+; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x10]
+; CHECK-NEXT:   sub    esp,0x40
+; CHECK-NEXT:   mov    ecx,esp
+; CHECK-NEXT:   mov    DWORD PTR [esp],eax
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x20],eax
+; CHECK-NEXT:   sub    esp,0x20
+; CHECK-NEXT:   mov    DWORD PTR [esp],eax
+; CHECK-NEXT:   lea    eax,[esp+0x20]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x4],eax
+; CHECK-NEXT:   lea    eax,[esp+0x40]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x8],eax
+; CHECK-NEXT:   lea    eax,[esp+0x20]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0xc],eax
+; CHECK-NEXT:   lea    eax,[esp+0x40]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x10],eax
+; CHECK-NEXT:   call
+; CHECK-NEXT:   add    esp,0x20
+; CHECK-NEXT:   add    esp,0x4c
+; CHECK-NEXT:   ret
--- a/tests_lit/llvm2ice_tests/fused-alloca.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca.ll
+; This is a basic test of the alloca instruction.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 -allow-externally-defined-symbols \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+; Test that a sequence of allocas with less than stack alignment get fused.
+define internal void @fused_small_align(i32 %arg) {
+entry:
+  %a1 = alloca i8, i32 8, align 4
+  %a2 = alloca i8, i32 12, align 4
+  %a3 = alloca i8, i32 16, align 8
+  %p1 = bitcast i8* %a1 to i32*
+  %p2 = bitcast i8* %a2 to i32*
+  %p3 = bitcast i8* %a3 to i32*
+  store i32 %arg, i32* %p1, align 1
+  store i32 %arg, i32* %p2, align 1
+  store i32 %arg, i32* %p3, align 1
+  ret void
+}
+; CHECK-LABEL: fused_small_align
+; CHECK-NEXT: sub    esp,0xc
+; CHECK-NEXT: mov    eax,DWORD PTR [esp+0x10]
+; CHECK-NEXT: sub    esp,0x30
+; CHECK-NEXT: mov    {{.*}},esp
+; CHECK-NEXT: mov    DWORD PTR [esp+0x10],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x18],eax
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: add    esp,0x3c
+
+; Test that a sequence of allocas with greater than stack alignment get fused.
+define internal void @fused_large_align(i32 %arg) {
+entry:
+  %a1 = alloca i8, i32 8, align 32
+  %a2 = alloca i8, i32 12, align 64
+  %a3 = alloca i8, i32 16, align 32
+  %p1 = bitcast i8* %a1 to i32*
+  %p2 = bitcast i8* %a2 to i32*
+  %p3 = bitcast i8* %a3 to i32*
+  store i32 %arg, i32* %p1, align 1
+  store i32 %arg, i32* %p2, align 1
+  store i32 %arg, i32* %p3, align 1
+  ret void
+}
+; CHECK-LABEL: fused_large_align
+; CHECK-NEXT: push   ebp
+; CHECK-NEXT: mov    ebp,esp
+; CHECK-NEXT: sub    esp,0x8
+; CHECK-NEXT: mov    eax,DWORD PTR [ebp+0x8]
+; CHECK-NEXT: and    esp,0xffffffc0
+; CHECK-NEXT: sub    esp,0x80
+; CHECK-NEXT: mov    ecx,esp
+; CHECK-NEXT: mov    DWORD PTR [esp+0x40],eax
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x60],eax
+; CHECK-NEXT: mov    esp,ebp
+; CHECK-NEXT: pop    ebp
--- a/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll
@@ -40,7 +40,6 @@ entry:
 }
 ; CHECK-LABEL: test_fused_load_sub_a
 ;    alloca store
-; CHECK: mov {{.*}},esp
 ; CHECK: mov DWORD PTR {{.*}},0x3e7
 ;    atomic store (w/ its own mfence)
 ; The load + sub are optimized into one everywhere.
@@ -80,7 +79,6 @@ entry:
 }
 ; CHECK-LABEL: test_fused_load_sub_b
 ;    alloca store
-; CHECK: mov {{.*}},esp
 ; CHECK: mov DWORD PTR {{.*}},0x3e7
 ;    atomic store (w/ its own mfence)
 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
@@ -121,7 +119,6 @@ entry:
 }
 ; CHECK-LABEL: test_fused_load_sub_c
 ;    alloca store
-; CHECK: mov {{.*}},esp
 ; CHECK: mov DWORD PTR {{.*}},0x3e7
 ;    atomic store (w/ its own mfence)
 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a