Subzero: Align spill locations to natural alignment.

This requires sorting the spilled variables based on alignment and introducing additional padding around the spill location areas. These changes allow vector instructions to accept memory operands. Old stack frame layout: New stack frame layout: +---------------------+ +---------------------+ | return address | | return address | +---------------------+ +---------------------+ | preserved registers | | preserved registers | +---------------------+ +---------------------+ | global spill area | | padding | +---------------------+ +---------------------+ | local spill area | | global spill area | +---------------------+ +---------------------+ | padding | | padding | +---------------------+ +---------------------+ | local variables | | local spill area | +---------------------+ +---------------------+ | padding | +---------------------+ | local variables | +---------------------+ BUG=none R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/465413003

Subzero: Align spill locations to natural alignment.
d4799f47 · Matt Wala · f820da5e · d4799f47 · d4799f47 · d4799f47
Commit d4799f47 authored Aug 14, 2014 by Matt Wala
4 changed files
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -134,7 +134,7 @@ public:
  virtual bool hasFramePointer() const { return false; }
  virtual SizeT getFrameOrStackReg() const = 0;
-  virtual size_t typeWidthInBytesOnStack(Type Ty) = 0;
+  virtual size_t typeWidthInBytesOnStack(Type Ty) const = 0;
  bool hasComputedFrame() const { return HasComputedFrame; }
  int32_t getStackAdjustment() const { return StackAdjustment; }
  void updateStackAdjustment(int32_t Offset) { StackAdjustment += Offset; }

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -24,6 +24,8 @@
 #include "IceTargetLoweringX8632.h"
 #include "llvm/Support/CommandLine.h"
+#include <strings.h>
 namespace Ice {
 namespace {
@@ -128,13 +130,23 @@ const uint32_t X86_CHAR_BIT = 8;
 const uint32_t X86_STACK_ALIGNMENT_BYTES = 16;
 // Size of the return address on the stack
 const uint32_t X86_RET_IP_SIZE_BYTES = 4;
+// The base 2 logarithm of the width in bytes of the smallest stack slot
+const uint32_t X86_LOG2_OF_MIN_STACK_SLOT_SIZE = 2;
+// The base 2 logarithm of the width in bytes of the largest stack slot
+const uint32_t X86_LOG2_OF_MAX_STACK_SLOT_SIZE = 4;
+// Value and Alignment are in bytes.  Return Value adjusted to the next
+// highest multiple of Alignment.
+uint32_t applyAlignment(uint32_t Value, uint32_t Alignment) {
+  // power of 2
+  assert((Alignment & (Alignment - 1)) == 0);
+  return (Value + Alignment - 1) & -Alignment;
+}
-// Value is a size in bytes.  Return Value adjusted to the next highest
+// Value is in bytes. Return Value adjusted to the next highest multiple
-// multiple of the stack alignment.
+// of the stack alignment.
 uint32_t applyStackAlignment(uint32_t Value) {
-  // power of 2
+  return applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
-  assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0);
-  return (Value + X86_STACK_ALIGNMENT_BYTES - 1) & -X86_STACK_ALIGNMENT_BYTES;
 }
 // Instruction set options
@@ -248,7 +260,7 @@ void __attribute__((unused)) xMacroIntegrityCheck() {
 TargetX8632::TargetX8632(Cfg *Func)
    : TargetLowering(Func), InstructionSet(CLInstructionSet),
      IsEbpBasedFrame(false), NeedsStackAlignment(false), FrameSizeLocals(0),
-      LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false),
+      SpillAreaSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false),
      PhysicalRegisters(VarList(Reg_NUM)) {
  // TODO: Don't initialize IntegerRegisters and friends every time.
  // Instead, initialize in some sort of static initializer for the
@@ -520,6 +532,30 @@ void TargetX8632::lowerArguments() {
  }
 }
+void TargetX8632::sortByAlignment(VarList &Dest, const VarList &Source) const {
+  // Sort the variables into buckets according to the log of their width
+  // in bytes.
+  const SizeT NumBuckets =
+      X86_LOG2_OF_MAX_STACK_SLOT_SIZE - X86_LOG2_OF_MIN_STACK_SLOT_SIZE + 1;
+  VarList Buckets[NumBuckets];
+  for (VarList::const_iterator I = Source.begin(), E = Source.end(); I != E;
+       ++I) {
+    Variable *Var = *I;
+    uint32_t NaturalAlignment = typeWidthInBytesOnStack(Var->getType());
+    SizeT LogNaturalAlignment = ffs(NaturalAlignment) - 1;
+    assert(LogNaturalAlignment >= X86_LOG2_OF_MIN_STACK_SLOT_SIZE);
+    assert(LogNaturalAlignment <= X86_LOG2_OF_MAX_STACK_SLOT_SIZE);
+    SizeT BucketIndex = LogNaturalAlignment - X86_LOG2_OF_MIN_STACK_SLOT_SIZE;
+    Buckets[BucketIndex].push_back(Var);
+  }
+  for (SizeT I = 0, E = NumBuckets; I < E; ++I) {
+    VarList &List = Buckets[NumBuckets - I - 1];
+    Dest.insert(Dest.end(), List.begin(), List.end());
+  }
+}
 // Helper function for addProlog().
 //
 // This assumes Arg is an argument passed on the stack.  This sets the
@@ -563,6 +599,35 @@ void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
 Type TargetX8632::stackSlotType() { return IceType_i32; }
 void TargetX8632::addProlog(CfgNode *Node) {
+  // Stack frame layout:
+  //
+  // +------------------------+
+  // | 1. return address      |
+  // +------------------------+
+  // | 2. preserved registers |
+  // +------------------------+
+  // | 3. padding             |
+  // +------------------------+
+  // | 4. global spill area   |
+  // +------------------------+
+  // | 5. padding             |
+  // +------------------------+
+  // | 6. local spill area    |
+  // +------------------------+
+  // | 7. padding             |
+  // +------------------------+
+  // | 8. allocas             |
+  // +------------------------+
+  //
+  // The following variables record the size in bytes of the given areas:
+  //  * X86_RET_IP_SIZE_BYTES:  area 1
+  //  * PreservedRegsSizeBytes: area 2
+  //  * SpillAreaPaddingBytes:  area 3
+  //  * GlobalsSize:            area 4
+  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
+  //  * LocalsSpillAreaSize:    area 6
+  //  * SpillAreaSizeBytes:     areas 3 - 7
  // If SimpleCoalescing is false, each variable without a register
  // gets its own unique stack slot, which leads to large stack
  // frames.  If SimpleCoalescing is true, then each "global" variable
@@ -573,7 +638,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
  const bool SimpleCoalescing = true;
  size_t InArgsSizeBytes = 0;
  size_t PreservedRegsSizeBytes = 0;
-  LocalsSizeBytes = 0;
+  SpillAreaSizeBytes = 0;
  Context.init(Node);
  Context.setInsertPoint(Context.getCur());
@@ -595,10 +660,19 @@ void TargetX8632::addProlog(CfgNode *Node) {
  std::vector<size_t> LocalsSize(Func->getNumNodes());
  // Prepass.  Compute RegsUsed, PreservedRegsSizeBytes, and
-  // LocalsSizeBytes.
+  // SpillAreaSizeBytes.
  RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
  const VarList &Variables = Func->getVariables();
  const VarList &Args = Func->getArgs();
+  VarList SpilledVariables, SortedSpilledVariables,
+      VariablesLinkedToSpillSplots;
+  // If there is a separate locals area, this specifies the alignment
+  // for it.
+  uint32_t LocalsSlotsAlignmentBytes = 0;
+  // The entire spill locations area gets aligned to largest natural
+  // alignment of the variables that have a spill slot.
+  uint32_t SpillAreaAlignmentBytes = 0;
  for (VarList::const_iterator I = Variables.begin(), E = Variables.end();
       I != E; ++I) {
    Variable *Var = *I;
@@ -617,25 +691,42 @@ void TargetX8632::addProlog(CfgNode *Node) {
    // that stack slot.
    if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) {
      if (Variable *Linked = Var->getPreferredRegister()) {
-        if (!Linked->hasReg())
+        if (!Linked->hasReg()) {
+          VariablesLinkedToSpillSplots.push_back(Var);
          continue;
+        }
      }
    }
+    SpilledVariables.push_back(Var);
+  }
+  SortedSpilledVariables.reserve(SpilledVariables.size());
+  sortByAlignment(SortedSpilledVariables, SpilledVariables);
+  for (VarList::const_iterator I = SortedSpilledVariables.begin(),
+                               E = SortedSpilledVariables.end();
+       I != E; ++I) {
+    Variable *Var = *I;
    size_t Increment = typeWidthInBytesOnStack(Var->getType());
+    if (!SpillAreaAlignmentBytes)
+      SpillAreaAlignmentBytes = Increment;
    if (SimpleCoalescing) {
      if (Var->isMultiblockLife()) {
        GlobalsSize += Increment;
      } else {
        SizeT NodeIndex = Var->getLocalUseNode()->getIndex();
        LocalsSize[NodeIndex] += Increment;
-        if (LocalsSize[NodeIndex] > LocalsSizeBytes)
+        if (LocalsSize[NodeIndex] > SpillAreaSizeBytes)
-          LocalsSizeBytes = LocalsSize[NodeIndex];
+          SpillAreaSizeBytes = LocalsSize[NodeIndex];
+        if (!LocalsSlotsAlignmentBytes)
+          LocalsSlotsAlignmentBytes = Increment;
      }
    } else {
-      LocalsSizeBytes += Increment;
+      SpillAreaSizeBytes += Increment;
    }
  }
-  LocalsSizeBytes += GlobalsSize;
+  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
+  SpillAreaSizeBytes += GlobalsSize;
  // Add push instructions for preserved registers.
  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
@@ -658,17 +749,40 @@ void TargetX8632::addProlog(CfgNode *Node) {
    _mov(ebp, esp);
  }
+  // Align the variables area. SpillAreaPaddingBytes is the size of
+  // the region after the preserved registers and before the spill
+  // areas.
+  uint32_t SpillAreaPaddingBytes = 0;
+  if (SpillAreaAlignmentBytes) {
+    assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES);
+    uint32_t PaddingStart = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
+    uint32_t SpillAreaStart =
+        applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
+    SpillAreaPaddingBytes = SpillAreaStart - PaddingStart;
+    SpillAreaSizeBytes += SpillAreaPaddingBytes;
+  }
+  // If there are separate globals and locals areas, make sure the
+  // locals area is aligned by padding the end of the globals area.
+  uint32_t GlobalsAndSubsequentPaddingSize = GlobalsSize;
+  if (LocalsSlotsAlignmentBytes) {
+    assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
+    GlobalsAndSubsequentPaddingSize =
+        applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
+    SpillAreaSizeBytes += GlobalsAndSubsequentPaddingSize - GlobalsSize;
+  }
+  // Align esp if necessary.
  if (NeedsStackAlignment) {
-    uint32_t StackSize = applyStackAlignment(
+    uint32_t StackOffset = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
-        X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes + LocalsSizeBytes);
+    uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
-    LocalsSizeBytes =
+    SpillAreaSizeBytes = StackSize - StackOffset;
-        StackSize - X86_RET_IP_SIZE_BYTES - PreservedRegsSizeBytes;
  }
-  // Generate "sub esp, LocalsSizeBytes"
+  // Generate "sub esp, SpillAreaSizeBytes"
-  if (LocalsSizeBytes)
+  if (SpillAreaSizeBytes)
    _sub(getPhysicalRegister(Reg_esp),
-         Ctx->getConstantInt(IceType_i32, LocalsSizeBytes));
+         Ctx->getConstantInt(IceType_i32, SpillAreaSizeBytes));
  resetStackAdjustment();
@@ -678,7 +792,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
  Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
  size_t BasicFrameOffset = PreservedRegsSizeBytes + X86_RET_IP_SIZE_BYTES;
  if (!IsEbpBasedFrame)
-    BasicFrameOffset += LocalsSizeBytes;
+    BasicFrameOffset += SpillAreaSizeBytes;
  unsigned NumXmmArgs = 0;
  for (SizeT i = 0; i < Args.size(); ++i) {
@@ -692,40 +806,24 @@ void TargetX8632::addProlog(CfgNode *Node) {
  }
  // Fill in stack offsets for locals.
-  size_t TotalGlobalsSize = GlobalsSize;
+  size_t GlobalsSpaceUsed = SpillAreaPaddingBytes;
-  GlobalsSize = 0;
  LocalsSize.assign(LocalsSize.size(), 0);
-  size_t NextStackOffset = 0;
+  size_t NextStackOffset = GlobalsSpaceUsed;
-  for (VarList::const_iterator I = Variables.begin(), E = Variables.end();
+  for (VarList::const_iterator I = SortedSpilledVariables.begin(),
+                               E = SortedSpilledVariables.end();
       I != E; ++I) {
    Variable *Var = *I;
-    if (Var->hasReg()) {
-      RegsUsed[Var->getRegNum()] = true;
-      continue;
-    }
-    if (Var->getIsArg())
-      continue;
-    if (ComputedLiveRanges && Var->getLiveRange().isEmpty())
-      continue;
-    if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) {
-      if (Variable *Linked = Var->getPreferredRegister()) {
-        if (!Linked->hasReg()) {
-          // TODO: Make sure Linked has already been assigned a stack
-          // slot.
-          Var->setStackOffset(Linked->getStackOffset());
-          continue;
-        }
-      }
-    }
    size_t Increment = typeWidthInBytesOnStack(Var->getType());
    if (SimpleCoalescing) {
      if (Var->isMultiblockLife()) {
-        GlobalsSize += Increment;
+        GlobalsSpaceUsed += Increment;
-        NextStackOffset = GlobalsSize;
+        NextStackOffset = GlobalsSpaceUsed;
      } else {
        SizeT NodeIndex = Var->getLocalUseNode()->getIndex();
        LocalsSize[NodeIndex] += Increment;
-        NextStackOffset = TotalGlobalsSize + LocalsSize[NodeIndex];
+        NextStackOffset = SpillAreaPaddingBytes +
+                          GlobalsAndSubsequentPaddingSize +
+                          LocalsSize[NodeIndex];
      }
    } else {
      NextStackOffset += Increment;
@@ -733,18 +831,45 @@ void TargetX8632::addProlog(CfgNode *Node) {
    if (IsEbpBasedFrame)
      Var->setStackOffset(-NextStackOffset);
    else
-      Var->setStackOffset(LocalsSizeBytes - NextStackOffset);
+      Var->setStackOffset(SpillAreaSizeBytes - NextStackOffset);
  }
-  this->FrameSizeLocals = NextStackOffset;
+  this->FrameSizeLocals = NextStackOffset - SpillAreaPaddingBytes;
  this->HasComputedFrame = true;
+  // Assign stack offsets to variables that have been linked to spilled
+  // variables.
+  for (VarList::const_iterator I = VariablesLinkedToSpillSplots.begin(),
+                               E = VariablesLinkedToSpillSplots.end();
+       I != E; ++I) {
+    Variable *Var = *I;
+    Variable *Linked = Var->getPreferredRegister();
+    Var->setStackOffset(Linked->getStackOffset());
+  }
  if (Func->getContext()->isVerbose(IceV_Frame)) {
-    Func->getContext()->getStrDump() << "LocalsSizeBytes=" << LocalsSizeBytes
+    Ostream &Str = Func->getContext()->getStrDump();
-                                     << "\n"
-                                     << "InArgsSizeBytes=" << InArgsSizeBytes
+    Str << "Stack layout:\n";
-                                     << "\n"
+    uint32_t EspAdjustmentPaddingSize =
-                                     << "PreservedRegsSizeBytes="
+        SpillAreaSizeBytes - LocalsSpillAreaSize -
-                                     << PreservedRegsSizeBytes << "\n";
+        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
+    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
+        << " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n"
+        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
+        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
+        << " globals spill area = " << GlobalsSize << " bytes\n"
+        << " globals-locals spill areas intermediate padding = "
+        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
+        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
+        << " esp alignment padding = " << EspAdjustmentPaddingSize
+        << " bytes\n";
+    Str << "Stack details:\n"
+        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
+        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
+        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
+        << " bytes\n"
+        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
 }
@@ -771,9 +896,9 @@ void TargetX8632::addEpilog(CfgNode *Node) {
    _mov(esp, ebp);
    _pop(ebp);
  } else {
-    // add esp, LocalsSizeBytes
+    // add esp, SpillAreaSizeBytes
-    if (LocalsSizeBytes)
+    if (SpillAreaSizeBytes)
-      _add(esp, Ctx->getConstantInt(IceType_i32, LocalsSizeBytes));
+      _add(esp, Ctx->getConstantInt(IceType_i32, SpillAreaSizeBytes));
  }
  // Add pop instructions for preserved registers.
@@ -991,8 +1116,7 @@ void TargetX8632::lowerAlloca(const InstAlloca *Inst) {
  if (ConstantInteger *ConstantTotalSize =
          llvm::dyn_cast<ConstantInteger>(TotalSize)) {
    uint32_t Value = ConstantTotalSize->getValue();
-    // Round Value up to the next highest multiple of the alignment.
+    Value = applyAlignment(Value, Alignment);
-    Value = (Value + Alignment - 1) & -Alignment;
    _sub(esp, Ctx->getConstantInt(IceType_i32, Value));
  } else {
    // Non-constant sizes need to be adjusted to the next highest
@@ -1239,12 +1363,6 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
  } else if (isVectorType(Dest->getType())) {
    // TODO: Trap on integer divide and integer modulo by zero.
    // See: https://code.google.com/p/nativeclient/issues/detail?id=3899
-    //
-    // TODO(wala): ALIGNHACK: All vector arithmetic is currently done in
-    // registers.  This is a workaround of the fact that there is no
-    // support for aligning stack operands.  Once there is support,
-    // remove LEGAL_HACK.
-#define LEGAL_HACK(s) legalizeToVar((s))
    switch (Inst->getOp()) {
    case InstArithmetic::_num:
      llvm_unreachable("Unknown arithmetic operator");
@@ -1252,31 +1370,31 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
    case InstArithmetic::Add: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _padd(T, LEGAL_HACK(Src1));
+      _padd(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::And: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _pand(T, LEGAL_HACK(Src1));
+      _pand(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Or: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _por(T, LEGAL_HACK(Src1));
+      _por(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Xor: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _pxor(T, LEGAL_HACK(Src1));
+      _pxor(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Sub: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _psub(T, LEGAL_HACK(Src1));
+      _psub(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Mul: {
@@ -1287,7 +1405,7 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
        Variable *T = makeReg(Dest->getType());
        _movp(T, Src0);
-        _pmull(T, LEGAL_HACK(Src1));
+        _pmull(T, Src1);
        _movp(Dest, T);
      } else if (Dest->getType() == IceType_v4i32) {
        // Lowering sequence:
@@ -1320,14 +1438,9 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
        Variable *T3 = makeReg(IceType_v4i32);
        Variable *T4 = makeReg(IceType_v4i32);
        _movp(T1, Src0);
-        // TODO(wala): ALIGHNHACK: Replace Src0R with Src0 and Src1R
+        _pshufd(T2, Src0, Mask1030);
-        // with Src1 after stack operand alignment support is
+        _pshufd(T3, Src1, Mask1030);
-        // implemented.
+        _pmuludq(T1, Src1);
-        Variable *Src0R = LEGAL_HACK(Src0);
-        Variable *Src1R = LEGAL_HACK(Src1);
-        _pshufd(T2, Src0R, Mask1030);
-        _pshufd(T3, Src1R, Mask1030);
-        _pmuludq(T1, Src1R);
        _pmuludq(T2, T3);
        _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));
        _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));
@@ -1349,32 +1462,31 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
    case InstArithmetic::Fadd: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _addps(T, LEGAL_HACK(Src1));
+      _addps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fsub: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _subps(T, LEGAL_HACK(Src1));
+      _subps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fmul: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _mulps(T, LEGAL_HACK(Src1));
+      _mulps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fdiv: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
-      _divps(T, LEGAL_HACK(Src1));
+      _divps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Frem:
      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
      break;
    }
-#undef LEGAL_HACK
  } else { // Dest->getType() is non-i64 scalar
    Variable *T_edx = NULL;
    Variable *T = NULL;
@@ -2199,22 +2311,15 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
    _pextr(ExtractedElementR, SourceVectR, Mask);
  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
    // Use pshufd and movd/movss.
-    //
-    // ALIGNHACK: Force vector operands to registers in instructions
-    // that require aligned memory operands until support for data
-    // alignment is implemented.
-#define ALIGN_HACK(Vect) legalizeToVar((Vect))
-    Operand *SourceVectRM =
-        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
    Variable *T = NULL;
    if (Index) {
      // The shuffle only needs to occur if the element to be extracted
      // is not at the lowest index.
      Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
      T = makeReg(Ty);
-      _pshufd(T, ALIGN_HACK(SourceVectRM), Mask);
+      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
    } else {
-      T = ALIGN_HACK(SourceVectRM);
+      T = legalizeToVar(SourceVectNotLegalized);
    }
    if (InVectorElementTy == IceType_i32) {
@@ -2228,7 +2333,6 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
      Context.insert(InstFakeDef::create(Func, ExtractedElementR));
      _movss(ExtractedElementR, T);
    }
-#undef ALIGN_HACK
  } else {
    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
    // Spill the value to a stack slot and do the extraction in memory.
@@ -2287,23 +2391,18 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      // ALIGNHACK: Without support for data alignment, both operands to
-      // cmpps need to be forced into registers.  Once support for data
-      // alignment is implemented, remove LEGAL_HACK.
-#define LEGAL_HACK(Vect) legalizeToVar((Vect))
      switch (Condition) {
      default: {
        InstX8632Cmpps::CmppsCond Predicate = TableFcmp[Index].Predicate;
        assert(Predicate != InstX8632Cmpps::Cmpps_Invalid);
        T = makeReg(Src0RM->getType());
        _movp(T, Src0RM);
-        _cmpps(T, LEGAL_HACK(Src1RM), Predicate);
+        _cmpps(T, Src1RM, Predicate);
      } break;
      case InstFcmp::One: {
        // Check both unequal and ordered.
        T = makeReg(Src0RM->getType());
        Variable *T2 = makeReg(Src0RM->getType());
-        Src1RM = LEGAL_HACK(Src1RM);
        _movp(T, Src0RM);
        _cmpps(T, Src1RM, InstX8632Cmpps::Cmpps_neq);
        _movp(T2, Src0RM);
@@ -2314,7 +2413,6 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
        // Check both equal or unordered.
        T = makeReg(Src0RM->getType());
        Variable *T2 = makeReg(Src0RM->getType());
-        Src1RM = LEGAL_HACK(Src1RM);
        _movp(T, Src0RM);
        _cmpps(T, Src1RM, InstX8632Cmpps::Cmpps_eq);
        _movp(T2, Src0RM);
@@ -2322,7 +2420,6 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
        _por(T, T2);
      } break;
      }
-#undef LEGAL_HACK
    }
    _movp(Dest, T);
@@ -2427,10 +2524,6 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
      Src1RM = T1;
    }
-    // TODO: ALIGNHACK: Both operands to compare instructions need to be
-    // in registers until data alignment support is implemented.  Once
-    // there is support for data alignment, LEGAL_HACK can be removed.
-#define LEGAL_HACK(Vect) legalizeToVar((Vect))
    Variable *T = makeReg(Ty);
    switch (Condition) {
    default:
@@ -2438,42 +2531,41 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
      break;
    case InstIcmp::Eq: {
      _movp(T, Src0RM);
-      _pcmpeq(T, LEGAL_HACK(Src1RM));
+      _pcmpeq(T, Src1RM);
    } break;
    case InstIcmp::Ne: {
      _movp(T, Src0RM);
-      _pcmpeq(T, LEGAL_HACK(Src1RM));
+      _pcmpeq(T, Src1RM);
      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
      _pxor(T, MinusOne);
    } break;
    case InstIcmp::Ugt:
    case InstIcmp::Sgt: {
      _movp(T, Src0RM);
-      _pcmpgt(T, LEGAL_HACK(Src1RM));
+      _pcmpgt(T, Src1RM);
    } break;
    case InstIcmp::Uge:
    case InstIcmp::Sge: {
      // !(Src1RM > Src0RM)
      _movp(T, Src1RM);
-      _pcmpgt(T, LEGAL_HACK(Src0RM));
+      _pcmpgt(T, Src0RM);
      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
      _pxor(T, MinusOne);
    } break;
    case InstIcmp::Ult:
    case InstIcmp::Slt: {
      _movp(T, Src1RM);
-      _pcmpgt(T, LEGAL_HACK(Src0RM));
+      _pcmpgt(T, Src0RM);
    } break;
    case InstIcmp::Ule:
    case InstIcmp::Sle: {
      // !(Src0RM > Src1RM)
      _movp(T, Src0RM);
-      _pcmpgt(T, LEGAL_HACK(Src1RM));
+      _pcmpgt(T, Src1RM);
      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
      _pxor(T, MinusOne);
    } break;
    }
-#undef LEGAL_HACK
    _movp(Dest, T);
    eliminateNextVectorSextInstruction(Dest);
@@ -2649,12 +2741,7 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
    Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]);
    Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]);
-    // ALIGNHACK: Force vector operands to registers in instructions
-    // that require aligned memory operands until support for data
-    // alignment is implemented.
-#define ALIGN_HACK(Vect) legalizeToVar((Vect))
    if (Index == 1) {
-      SourceVectRM = ALIGN_HACK(SourceVectRM);
      _shufps(ElementR, SourceVectRM, Mask1Constant);
      _shufps(ElementR, SourceVectRM, Mask2Constant);
      _movp(Inst->getDest(), ElementR);
@@ -2665,7 +2752,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
      _shufps(T, ElementR, Mask2Constant);
      _movp(Inst->getDest(), T);
    }
-#undef ALIGN_HACK
  } else {
    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
    // Spill the value to a stack slot and perform the insertion in
@@ -3627,10 +3713,6 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
    Variable *T = makeReg(SrcTy);
    Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
    Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
-    // ALIGNHACK: Until data alignment support is implemented, vector
-    // instructions need to have vector operands in registers.  Once
-    // there is support for data alignment, LEGAL_HACK can be removed.
-#define LEGAL_HACK(Vect) legalizeToVar((Vect))
    if (InstructionSet >= SSE4_1) {
      // TODO(wala): If the condition operand is a constant, use blendps
      // or pblendw.
@@ -3643,7 +3725,7 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
        _movp(xmm0, ConditionRM);
        _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31));
        _movp(T, SrcFRM);
-        _blendvps(T, LEGAL_HACK(SrcTRM), xmm0);
+        _blendvps(T, SrcTRM, xmm0);
        _movp(Dest, T);
      } else {
        assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
@@ -3652,7 +3734,7 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
        Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0);
        lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
        _movp(T, SrcFRM);
-        _pblendvb(T, LEGAL_HACK(SrcTRM), xmm0);
+        _pblendvb(T, SrcTRM, xmm0);
        _movp(Dest, T);
      }
      return;
@@ -3676,11 +3758,10 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
      _movp(T, ConditionRM);
    }
    _movp(T2, T);
-    _pand(T, LEGAL_HACK(SrcTRM));
+    _pand(T, SrcTRM);
-    _pandn(T2, LEGAL_HACK(SrcFRM));
+    _pandn(T2, SrcFRM);
    _por(T, T2);
    _movp(Dest, T);
-#undef LEGAL_HACK
    return;
  }

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -40,7 +40,7 @@ public:
  virtual SizeT getFrameOrStackReg() const {
    return IsEbpBasedFrame ? Reg_ebp : Reg_esp;
  }
-  virtual size_t typeWidthInBytesOnStack(Type Ty) {
+  virtual size_t typeWidthInBytesOnStack(Type Ty) const {
    // Round up to the next multiple of 4 bytes.  In particular, i1,
    // i8, and i16 are rounded up to 4 bytes.
    return (typeWidthInBytes(Ty) + 3) & ~3;
@@ -125,6 +125,8 @@ protected:
  void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
                           Operand *Src0, Operand *Src1);
+  void sortByAlignment(VarList &Dest, const VarList &Source) const;
  // Operand legalization helpers.  To deal with address mode
  // constraints, the helpers will create a new Operand and emit
  // instructions that guarantee that the Operand kind is one of those
@@ -458,7 +460,7 @@ protected:
  bool IsEbpBasedFrame;
  bool NeedsStackAlignment;
  size_t FrameSizeLocals;
-  size_t LocalsSizeBytes;
+  size_t SpillAreaSizeBytes;
  llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
  llvm::SmallBitVector ScratchRegs;
  llvm::SmallBitVector RegsUsed;

--- a/tests_lit/llvm2ice_tests/align-spill-locations.ll
+++ b/tests_lit/llvm2ice_tests/align-spill-locations.ll
+; This checks to ensure that Subzero aligns spill slots.
+; RUN: %llvm2ice --verbose none %s | FileCheck  %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck  %s
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; The location of the stack slot for a variable is inferred from the
+; return sequence.
+; In this file, "global" refers to a variable with a live range across
+; multiple basic blocks (not an LLVM global variable) and "local"
+; refers to a variable that is live in only a single basic block.
+define <4 x i32> @align_global_vector(i32 %arg) {
+entry:
+  %vec.global = insertelement <4 x i32> undef, i32 %arg, i32 0
+  br label %block
+block:
+  call void @ForceXmmSpills()
+  ret <4 x i32> %vec.global
+; CHECK-LABEL: align_global_vector:
+; CHECK: movups xmm0, xmmword ptr [esp]
+; CHECK-NEXT: add esp, 28
+; CHECK-NEXT: ret
+}
+define <4 x i32> @align_local_vector(i32 %arg) {
+entry:
+  br label %block
+block:
+  %vec.local = insertelement <4 x i32> undef, i32 %arg, i32 0
+  call void @ForceXmmSpills()
+  ret <4 x i32> %vec.local
+; CHECK-LABEL: align_local_vector:
+; CHECK: movups xmm0, xmmword ptr [esp]
+; CHECK-NEXT: add esp, 28
+; CHECK-NEXT: ret
+}
+declare void @ForceXmmSpills()
+define <4 x i32> @align_global_vector_ebp_based(i32 %arg) {
+entry:
+  %alloc = alloca i8, i32 1, align 1
+  %vec.global = insertelement <4 x i32> undef, i32 %arg, i32 0
+  br label %block
+block:
+  call void @ForceXmmSpillsAndUseAlloca(i8* %alloc)
+  ret <4 x i32> %vec.global
+; CHECK-LABEL: align_global_vector_ebp_based:
+; CHECK: movups xmm0, xmmword ptr [ebp-24]
+; CHECK-NEXT: mov esp, ebp
+; CHECK-NEXT: pop ebp
+; CHECK: ret
+}
+define <4 x i32> @align_local_vector_ebp_based(i32 %arg) {
+entry:
+  %alloc = alloca i8, i32 1, align 1
+  %vec.local = insertelement <4 x i32> undef, i32 %arg, i32 0
+  call void @ForceXmmSpillsAndUseAlloca(i8* %alloc)
+  ret <4 x i32> %vec.local
+; CHECK-LABEL: align_local_vector_ebp_based:
+; CHECK: movups xmm0, xmmword ptr [ebp-24]
+; CHECK-NEXT: mov esp, ebp
+; CHECK-NEXT: pop ebp
+; CHECK: ret
+}
+define <4 x i32> @align_local_vector_and_global_float(i32 %arg) {
+entry:
+  %float.global = sitofp i32 %arg to float
+  call void @ForceXmmSpillsAndUseFloat(float %float.global)
+  br label %block
+block:
+  %vec.local = insertelement <4 x i32> undef, i32 undef, i32 0
+  call void @ForceXmmSpillsAndUseFloat(float %float.global)
+  ret <4 x i32> %vec.local
+; CHECK-LABEL: align_local_vector_and_global_float:
+; CHECK: cvtsi2ss xmm0, eax
+; CHECK-NEXT: movss dword ptr [esp+28], xmm0
+; CHECK: movups xmm0, xmmword ptr [esp]
+; CHECK-NEXT: add esp, 44
+; CHECK-NEXT: ret
+}
+declare void @ForceXmmSpillsAndUseAlloca(i8*)
+declare void @ForceXmmSpillsAndUseFloat(float)
+; ERRORS-NOT: ICE translation error