Commit e81e8b3c by Antonio Maiorano

Subzero: fix calling C functions on Windows x64

This change addresses the following issues:
* The Microsoft x64 ABI assigns registers to the first four arguments by argument position, not by type count.
* The Microsoft x64 ABI expects the caller to allocate space into which the 4 register arguments can be copied on the stack, called the Shadow Store or Home Space.
* Fix a bug where the preserved register area size was not computed correctly when Xmm registers were being preserved, as it was assuming all preserved registers were 8 bytes large.

Bug: b/142132927
Change-Id: Ibc2d82ab117c062eed2e7f66109c9d6bbdc09a8b
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/37272
Reviewed-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
parent 01a7adc9
...@@ -160,7 +160,7 @@ namespace rr ...@@ -160,7 +160,7 @@ namespace rr
{ {
const Capabilities Caps = const Capabilities Caps =
{ {
false, // CallSupported true, // CallSupported
false, // CoroutinesSupported false, // CoroutinesSupported
}; };
......
...@@ -694,6 +694,12 @@ public: ...@@ -694,6 +694,12 @@ public:
(void)ArgNum; (void)ArgNum;
return RegNumT(); return RegNumT();
} }
/// Selects the register index that an argument should be assigned to.
///
/// \param argPos absolute position of the argument in the argument list
///        (ignored by this variant).
/// \param argPosByType position of the argument counted by type.
/// \return argPosByType, unchanged: here the register is chosen by type
///         count rather than by absolute position.
static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
  (void)argPos; // Deliberately unused; silences unused-parameter warnings.
  return argPosByType;
}
/// The number of bits in a byte /// The number of bits in a byte
static constexpr uint32_t X86_CHAR_BIT = 8; static constexpr uint32_t X86_CHAR_BIT = 8;
......
...@@ -742,6 +742,15 @@ public: ...@@ -742,6 +742,15 @@ public:
assert(Ty == IceType_i64 || Ty == IceType_i32); assert(Ty == IceType_i64 || Ty == IceType_i32);
return getGprForType(Ty, GprForArgNum[ArgNum]); return getGprForType(Ty, GprForArgNum[ArgNum]);
} }
/// Selects the register index that an argument should be assigned to.
///
/// Microsoft x64 ABI: the register is selected by the argument's absolute
/// position (e.g. the 1st int passed as the 2nd param goes into the 2nd int
/// reg), not by how many arguments of the same type preceded it.
///
/// \param argPos absolute position of the argument in the argument list.
/// \param argPosByType position of the argument counted by type (ignored by
///        this variant).
/// \return argPos, unchanged.
static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
  (void)argPosByType; // Deliberately unused; silences unused-parameter warnings.
  return argPos;
}
#else #else
// System V x86-64 calling convention: // System V x86-64 calling convention:
// //
...@@ -774,6 +783,12 @@ public: ...@@ -774,6 +783,12 @@ public:
assert(Ty == IceType_i64 || Ty == IceType_i32); assert(Ty == IceType_i64 || Ty == IceType_i32);
return getGprForType(Ty, GprForArgNum[ArgNum]); return getGprForType(Ty, GprForArgNum[ArgNum]);
} }
/// Maps an argument to the index of the register it should occupy.
///
/// \param argPos absolute position of the argument in the argument list;
///        this variant does not consult it.
/// \param argPosByType position of the argument counted by type.
/// \return the by-type position, passed through unchanged.
static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
  static_cast<void>(argPos);
  const SizeT RegisterIndex = argPosByType;
  return RegisterIndex;
}
#endif #endif
/// Whether scalar floating point arguments are passed in XMM registers /// Whether scalar floating point arguments are passed in XMM registers
......
...@@ -996,9 +996,9 @@ template <typename TraitsType> ...@@ -996,9 +996,9 @@ template <typename TraitsType>
void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
// Stack frame layout: // Stack frame layout:
// //
// +------------------------+ // +------------------------+ ^ +
// | 1. return address | // | 1. return address | |
// +------------------------+ // +------------------------+ v -
// | 2. preserved registers | // | 2. preserved registers |
// +------------------------+ <--- BasePointer (if used) // +------------------------+ <--- BasePointer (if used)
// | 3. padding | // | 3. padding |
...@@ -1011,6 +1011,8 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { ...@@ -1011,6 +1011,8 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
// +------------------------+ // +------------------------+
// | 7. padding | // | 7. padding |
// +------------------------+ // +------------------------+
// | 7.5 shadow (WinX64) |
// +------------------------+
// | 8. allocas | // | 8. allocas |
// +------------------------+ // +------------------------+
// | 9. padding | // | 9. padding |
...@@ -1040,6 +1042,17 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { ...@@ -1040,6 +1042,17 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
// space on the frame for globals (variables with multi-block lifetime), and // space on the frame for globals (variables with multi-block lifetime), and
// one block to share for locals (single-block lifetime). // one block to share for locals (single-block lifetime).
// The Microsoft x64 ABI requires the caller to allocate a minimum 32 byte
// "shadow store" (aka "home space") so that the callee may copy the 4
// register args to it.
#if defined(SUBZERO_USE_MICROSOFT_ABI)
const SizeT ShadowStoreSize = Traits::Is64Bit ? 4 * typeWidthInBytes(Traits::WordType) : 0;
#else
const SizeT ShadowStoreSize = 0;
#endif
// StackPointer: points just past return address of calling function
Context.init(Node); Context.init(Node);
Context.setInsertPoint(Context.getCur()); Context.setInsertPoint(Context.getCur());
...@@ -1092,11 +1105,17 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { ...@@ -1092,11 +1105,17 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
for (RegNumT RegNum : RegNumBVIter(Pushed)) { for (RegNumT RegNum : RegNumBVIter(Pushed)) {
assert(RegNum == Traits::getBaseReg(RegNum)); assert(RegNum == Traits::getBaseReg(RegNum));
++NumCallee; ++NumCallee;
if (Traits::isXmm(RegNum)) {
PreservedRegsSizeBytes += 16;
} else {
PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType); PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
}
_push_reg(RegNum); _push_reg(RegNum);
} }
Ctx->statsUpdateRegistersSaved(NumCallee); Ctx->statsUpdateRegistersSaved(NumCallee);
// StackPointer: points past preserved registers at start of spill area
// Generate "push frameptr; mov frameptr, stackptr" // Generate "push frameptr; mov frameptr, stackptr"
if (IsEbpBasedFrame) { if (IsEbpBasedFrame) {
assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)) assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
...@@ -1148,20 +1167,29 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { ...@@ -1148,20 +1167,29 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
if (PrologEmitsFixedAllocas) if (PrologEmitsFixedAllocas)
SpillAreaSizeBytes += FixedAllocaSizeBytes; SpillAreaSizeBytes += FixedAllocaSizeBytes;
// Win64 ABI: add space for shadow store (aka home space)
SpillAreaSizeBytes += ShadowStoreSize;
// Entering the function has made the stack pointer unaligned. Re-align it by // Entering the function has made the stack pointer unaligned. Re-align it by
// adjusting the stack size. // adjusting the stack size.
uint32_t StackOffset = Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes; // Note that StackOffset does not include spill area. It's the offset from the
// base stack pointer (ebp), whether we set it or not, to the first stack
// arg (if any). StackSize, on the other hand, does include the spill area.
const uint32_t StackOffset =
ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes, uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
RequiredStackAlignment); RequiredStackAlignment);
StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(), StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
RequiredStackAlignment); RequiredStackAlignment);
SpillAreaSizeBytes = StackSize - StackOffset; SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
if (SpillAreaSizeBytes) { if (SpillAreaSizeBytes) {
// Generate "sub stackptr, SpillAreaSizeBytes" // Generate "sub stackptr, SpillAreaSizeBytes"
_sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes)); _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
} }
// StackPointer: points just past the spill area (end of stack frame)
// If the required alignment is greater than the stack pointer's guaranteed // If the required alignment is greater than the stack pointer's guaranteed
// alignment, align the stack pointer accordingly. // alignment, align the stack pointer accordingly.
if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) { if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
...@@ -1170,6 +1198,8 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { ...@@ -1170,6 +1198,8 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
Ctx->getConstantInt32(-RequiredStackAlignment)); Ctx->getConstantInt32(-RequiredStackAlignment));
} }
// StackPointer: may have just been offset for alignment
// Account for known-frame-offset alloca instructions that were not already // Account for known-frame-offset alloca instructions that were not already
// combined into the prolog. // combined into the prolog.
if (!PrologEmitsFixedAllocas) if (!PrologEmitsFixedAllocas)
...@@ -1182,8 +1212,7 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { ...@@ -1182,8 +1212,7 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
// Arg[0] is closest to the stack/frame pointer. // Arg[0] is closest to the stack/frame pointer.
RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg(); RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType); Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
size_t BasicFrameOffset = size_t BasicFrameOffset = StackOffset;
PreservedRegsSizeBytes + Traits::X86_RET_IP_SIZE_BYTES;
if (!IsEbpBasedFrame) if (!IsEbpBasedFrame)
BasicFrameOffset += SpillAreaSizeBytes; BasicFrameOffset += SpillAreaSizeBytes;
...@@ -1193,22 +1222,26 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { ...@@ -1193,22 +1222,26 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
size_t InArgsSizeBytes = 0; size_t InArgsSizeBytes = 0;
unsigned NumXmmArgs = 0; unsigned NumXmmArgs = 0;
unsigned NumGPRArgs = 0; unsigned NumGPRArgs = 0;
for (Variable *Arg : Args) { for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
Variable *Arg = Args[i];
// Skip arguments passed in registers. // Skip arguments passed in registers.
if (isVectorType(Arg->getType())) { if (isVectorType(Arg->getType())) {
if (Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) { if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
.hasValue()) {
++NumXmmArgs; ++NumXmmArgs;
continue; continue;
} }
} else if (isScalarFloatingType(Arg->getType())) { } else if (isScalarFloatingType(Arg->getType())) {
if (Traits::X86_PASS_SCALAR_FP_IN_XMM && if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) { Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
.hasValue()) {
++NumXmmArgs; ++NumXmmArgs;
continue; continue;
} }
} else { } else {
assert(isScalarIntegerType(Arg->getType())); assert(isScalarIntegerType(Arg->getType()));
if (Traits::getRegisterForGprArgNum(Traits::WordType, NumGPRArgs) if (Traits::getRegisterForGprArgNum(Traits::WordType,
Traits::getArgIndex(i, NumGPRArgs))
.hasValue()) { .hasValue()) {
++NumGPRArgs; ++NumGPRArgs;
continue; continue;
...@@ -1551,7 +1584,8 @@ void TargetX86Base<TraitsType>::lowerArguments() { ...@@ -1551,7 +1584,8 @@ void TargetX86Base<TraitsType>::lowerArguments() {
Variable *RegisterArg = nullptr; Variable *RegisterArg = nullptr;
RegNumT RegNum; RegNumT RegNum;
if (isVectorType(Ty)) { if (isVectorType(Ty)) {
RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs); RegNum =
Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
if (RegNum.hasNoValue()) { if (RegNum.hasNoValue()) {
XmmSlotsRemain = false; XmmSlotsRemain = false;
continue; continue;
...@@ -1562,7 +1596,8 @@ void TargetX86Base<TraitsType>::lowerArguments() { ...@@ -1562,7 +1596,8 @@ void TargetX86Base<TraitsType>::lowerArguments() {
if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) { if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
continue; continue;
} }
RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs); RegNum =
Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
if (RegNum.hasNoValue()) { if (RegNum.hasNoValue()) {
XmmSlotsRemain = false; XmmSlotsRemain = false;
continue; continue;
...@@ -1570,7 +1605,8 @@ void TargetX86Base<TraitsType>::lowerArguments() { ...@@ -1570,7 +1605,8 @@ void TargetX86Base<TraitsType>::lowerArguments() {
++NumXmmArgs; ++NumXmmArgs;
RegisterArg = Func->makeVariable(Ty); RegisterArg = Func->makeVariable(Ty);
} else if (isScalarIntegerType(Ty)) { } else if (isScalarIntegerType(Ty)) {
RegNum = Traits::getRegisterForGprArgNum(Ty, NumGprArgs); RegNum = Traits::getRegisterForGprArgNum(
Ty, Traits::getArgIndex(i, NumGprArgs));
if (RegNum.hasNoValue()) { if (RegNum.hasNoValue()) {
GprSlotsRemain = false; GprSlotsRemain = false;
continue; continue;
...@@ -2617,11 +2653,14 @@ void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) { ...@@ -2617,11 +2653,14 @@ void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment, RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
Traits::X86_STACK_ALIGNMENT_BYTES); Traits::X86_STACK_ALIGNMENT_BYTES);
using OperandList = constexpr SizeT MaxOperands =
llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS, constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
Traits::X86_MAX_GPR_ARGS)>; using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
OperandList XmmArgs; OperandList XmmArgs;
llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
CfgVector<std::pair<const Type, Operand *>> GprArgs; CfgVector<std::pair<const Type, Operand *>> GprArgs;
CfgVector<SizeT> GprArgIndices;
OperandList StackArgs, StackArgLocations; OperandList StackArgs, StackArgLocations;
uint32_t ParameterAreaSizeBytes = 0; uint32_t ParameterAreaSizeBytes = 0;
...@@ -2633,14 +2672,22 @@ void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) { ...@@ -2633,14 +2672,22 @@ void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
// The PNaCl ABI requires the width of arguments to be at least 32 bits. // The PNaCl ABI requires the width of arguments to be at least 32 bits.
assert(typeWidthInBytes(Ty) >= 4); assert(typeWidthInBytes(Ty) >= 4);
if (isVectorType(Ty) && if (isVectorType(Ty) &&
Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) { Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
.hasValue()) {
XmmArgs.push_back(Arg); XmmArgs.push_back(Arg);
XmmArgIndices.push_back(i);
} else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM && } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) { Traits::getRegisterForXmmArgNum(
Traits::getArgIndex(i, XmmArgs.size()))
.hasValue()) {
XmmArgs.push_back(Arg); XmmArgs.push_back(Arg);
XmmArgIndices.push_back(i);
} else if (isScalarIntegerType(Ty) && } else if (isScalarIntegerType(Ty) &&
Traits::getRegisterForGprArgNum(Ty, GprArgs.size()).hasValue()) { Traits::getRegisterForGprArgNum(
Ty, Traits::getArgIndex(i, GprArgs.size()))
.hasValue()) {
GprArgs.emplace_back(Ty, Arg); GprArgs.emplace_back(Ty, Arg);
GprArgIndices.push_back(i);
} else { } else {
// Place on stack. // Place on stack.
StackArgs.push_back(Arg); StackArgs.push_back(Arg);
...@@ -2678,16 +2725,18 @@ void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) { ...@@ -2678,16 +2725,18 @@ void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
} }
// Copy arguments to be passed in registers to the appropriate registers. // Copy arguments to be passed in registers to the appropriate registers.
for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) { for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
XmmArgs[i] = XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
legalizeToReg(legalize(XmmArgs[i]), Traits::getRegisterForXmmArgNum(i)); Traits::getRegisterForXmmArgNum(
Traits::getArgIndex(XmmArgIndices[i], i)));
} }
// Materialize moves for arguments passed in GPRs. // Materialize moves for arguments passed in GPRs.
for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) { for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
const Type SignatureTy = GprArgs[i].first; const Type SignatureTy = GprArgs[i].first;
Operand *Arg = Operand *Arg =
legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable); legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
GprArgs[i].second = GprArgs[i].second = legalizeToReg(
legalizeToReg(Arg, Traits::getRegisterForGprArgNum(Arg->getType(), i)); Arg, Traits::getRegisterForGprArgNum(
Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32); assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
assert(SignatureTy == Arg->getType()); assert(SignatureTy == Arg->getType());
(void)SignatureTy; (void)SignatureTy;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment