Commit 4d40271e by Antonio Maiorano

Subzero: add support for large stacks on Windows

If the stack size is > 4K, emit chkstk, which probes the stack to commit the pages required to support the large stack. Bug: swiftshader:25 Change-Id: I6b9f09218736ffb641cb1dbf95a1de7149633ef8 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/41608 Presubmit-Ready: Antonio Maiorano <amaiorano@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com> Kokoro-Presubmit: kokoro <noreply+kokoro@google.com> Tested-by: Antonio Maiorano <amaiorano@google.com>
parent e3621dca
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include <array>
#include <cmath> #include <cmath>
#include <thread> #include <thread>
#include <tuple> #include <tuple>
...@@ -1501,6 +1502,53 @@ TEST(ReactorUnitTests, Args_GreaterThan5Mixed) ...@@ -1501,6 +1502,53 @@ TEST(ReactorUnitTests, Args_GreaterThan5Mixed)
} }
} }
// This test was written because on Windows with Subzero, we would get a crash when executing a function
// with a large number of local variables. The problem was that on Windows, 4K stack pages are committed
// on demand: an access to the "guard page" commits that page and makes the next 4K page the new guard
// page. If a stack access is made beyond the guard page, a regular page fault occurs instead. To fix
// this, Subzero (like MSVC) now emits a call to __chkstk with the stack size in EAX, so that it can
// probe the stack in 4K increments up to that size, committing the required pages.
// See https://docs.microsoft.com/en-us/windows/win32/devnotes/-win32-chkstk.
TEST(ReactorUnitTests, LargeStack)
{
#if defined(_WIN32)
	// Size of the routine's local array, chosen empirically so that its
	// accesses land outside the guard pages.
	constexpr int ArrayByteSize = 24 * 1024;
	constexpr int ArraySize = ArrayByteSize / sizeof(int32_t);

	FunctionT<void(int32_t * v)> function;
	{
		// A stack-allocated array big enough that storing to its first
		// element reaches past the guard page.
		Array<Int, ArraySize> stackLocals;
		for(int n = 0; n < ArraySize; ++n)
		{
			stackLocals[n] = n;
		}

		// Copy the locals out through the pointer argument so the caller can
		// verify every element.
		Pointer<Int> dest = function.Arg<0>();
		for(int n = 0; n < ArraySize; ++n)
		{
			dest[n] = stackLocals[n];
		}
	}
	auto largeStackRoutine = function("one");

	std::array<int32_t, ArraySize> v;

	// Run this in a thread, so that we get the default reserved stack size (8K on Win64).
	std::thread worker([&largeStackRoutine, &v] {
		largeStackRoutine(v.data());
	});
	worker.join();

	// Each element must hold its own index, as written by the routine.
	for(int i = 0; i < ArraySize; ++i)
	{
		EXPECT_EQ(v[i], i);
	}
#endif
}
TEST(ReactorUnitTests, Call) TEST(ReactorUnitTests, Call)
{ {
struct Class struct Class
......
...@@ -17,6 +17,10 @@ ...@@ -17,6 +17,10 @@
#include "IceTargetLoweringX8632Traits.h" #include "IceTargetLoweringX8632Traits.h"
// Stack-probe routine, declared here so that emitStackProbe can embed its
// address as a call target in generated code. Only declared when building for
// the Microsoft ABI.
#if defined(SUBZERO_USE_MICROSOFT_ABI)
extern "C" void _chkstk();
#endif
namespace X8632 { namespace X8632 {
std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) { std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
return ::Ice::X8632::TargetX8632::create(Func); return ::Ice::X8632::TargetX8632::create(Func);
...@@ -402,6 +406,32 @@ void TargetX8632::emitSandboxedReturn() { ...@@ -402,6 +406,32 @@ void TargetX8632::emitSandboxedReturn() {
lowerIndirectJump(T_ecx); lowerIndirectJump(T_ecx);
} }
/// Emits a call to _chkstk for frames of 4096 bytes or more. Does nothing for
/// smaller frames, or when SUBZERO_USE_MICROSOFT_ABI is not defined.
///
/// \param StackSizeBytes size in bytes of the stack area about to be reserved.
void TargetX8632::emitStackProbe(size_t StackSizeBytes) {
#if defined(SUBZERO_USE_MICROSOFT_ABI)
  // Frames below this size are not probed.
  constexpr size_t ProbeThresholdBytes = 4096;
  if (StackSizeBytes < ProbeThresholdBytes)
    return;

  // On Win32, _chkstk is really __alloca_probe: besides probing, it moves ESP
  // down by the amount passed in EAX. ESP is therefore stashed in ECX across
  // the call (with ECX itself preserved on the stack), and both registers are
  // restored afterwards.
  Variable *RegEax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
  Variable *RegEsp = makeReg(IceType_i32, Traits::RegisterSet::Reg_esp);
  Variable *RegEcx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);

  // Save ECX, then keep the current ESP in it.
  _push_reg(RegEcx->getRegNum());
  _mov(RegEcx, RegEsp);

  // EAX carries the number of bytes to probe.
  auto *ProbeSize =
      Ctx->getConstantInt32(static_cast<int32_t>(StackSizeBytes));
  _mov(RegEax, ProbeSize);

  // The probe routine's address is embedded as a 32-bit immediate.
  auto *ProbeAddress =
      Ctx->getConstantInt32(reinterpret_cast<int32_t>(&_chkstk));
  emitCallToTarget(ProbeAddress, nullptr);

  // Undo the ESP adjustment made by the probe, then recover ECX.
  _mov(RegEsp, RegEcx);
  _pop_reg(RegEcx->getRegNum());
#endif
}
// In some cases, there are x-macros tables for both high-level and low-level // In some cases, there are x-macros tables for both high-level and low-level
// instructions/operands that use the same enum key value. The tables are kept // instructions/operands that use the same enum key value. The tables are kept
// separate to maintain a proper separation between abstraction layers. There // separate to maintain a proper separation between abstraction layers. There
......
...@@ -59,6 +59,7 @@ protected: ...@@ -59,6 +59,7 @@ protected:
void initSandbox(); void initSandbox();
bool legalizeOptAddrForSandbox(OptAddr *Addr); bool legalizeOptAddrForSandbox(OptAddr *Addr);
void emitSandboxedReturn(); void emitSandboxedReturn();
void emitStackProbe(size_t StackSizeBytes);
void lowerIndirectJump(Variable *JumpTarget); void lowerIndirectJump(Variable *JumpTarget);
void emitGetIP(CfgNode *Node); void emitGetIP(CfgNode *Node);
Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) override; Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) override;
......
...@@ -17,6 +17,10 @@ ...@@ -17,6 +17,10 @@
#include "IceDefs.h" #include "IceDefs.h"
#include "IceTargetLoweringX8664Traits.h" #include "IceTargetLoweringX8664Traits.h"
// Stack-probe routine, declared here so that emitStackProbe can embed its
// address as a call target in generated code. Only declared when building for
// the Microsoft ABI.
#if defined(SUBZERO_USE_MICROSOFT_ABI)
extern "C" void __chkstk();
#endif
namespace X8664 { namespace X8664 {
std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) { std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
return ::Ice::X8664::TargetX8664::create(Func); return ::Ice::X8664::TargetX8664::create(Func);
...@@ -758,6 +762,26 @@ void TargetX8664::emitSandboxedReturn() { ...@@ -758,6 +762,26 @@ void TargetX8664::emitSandboxedReturn() {
} }
} }
/// Emits a call to __chkstk for frames of 4096 bytes or more. Does nothing
/// for smaller frames, or when SUBZERO_USE_MICROSOFT_ABI is not defined.
///
/// \param StackSizeBytes size in bytes of the stack area about to be reserved.
void TargetX8664::emitStackProbe(size_t StackSizeBytes) {
#if defined(SUBZERO_USE_MICROSOFT_ABI)
  // The threshold follows what MSVC actually does: it probes once locals
  // reach 4KB, not the 8KB stated in the documentation.
  constexpr size_t ProbeThresholdBytes = 4096;
  if (StackSizeBytes < ProbeThresholdBytes)
    return;

  // On Win64, __chkstk probes the stack down to RSP - EAX and leaves RSP
  // untouched, so there is nothing to save or restore around the call.
  Variable *RegEax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
  auto *ProbeSize =
      Ctx->getConstantInt32(static_cast<int32_t>(StackSizeBytes));
  _mov(RegEax, ProbeSize);

  // The 64-bit address of the probe routine is materialized in R11 and the
  // call is made through that register.
  auto *ProbeAddress =
      Ctx->getConstantInt64(reinterpret_cast<int64_t>(&__chkstk));
  Operand *ProbeAddressReg =
      legalizeToReg(ProbeAddress, Traits::RegisterSet::Reg_r11);
  emitCallToTarget(ProbeAddressReg, nullptr);
#endif
}
// In some cases, there are x-macros tables for both high-level and low-level // In some cases, there are x-macros tables for both high-level and low-level
// instructions/operands that use the same enum key value. The tables are kept // instructions/operands that use the same enum key value. The tables are kept
// separate to maintain a proper separation between abstraction layers. There // separate to maintain a proper separation between abstraction layers. There
......
...@@ -62,6 +62,7 @@ protected: ...@@ -62,6 +62,7 @@ protected:
void initSandbox(); void initSandbox();
bool legalizeOptAddrForSandbox(OptAddr *Addr); bool legalizeOptAddrForSandbox(OptAddr *Addr);
void emitSandboxedReturn(); void emitSandboxedReturn();
void emitStackProbe(size_t StackSizeBytes);
void lowerIndirectJump(Variable *JumpTarget); void lowerIndirectJump(Variable *JumpTarget);
void emitGetIP(CfgNode *Node); void emitGetIP(CfgNode *Node);
Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) override; Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) override;
......
...@@ -376,6 +376,12 @@ protected: ...@@ -376,6 +376,12 @@ protected:
void emitSandboxedReturn() { void emitSandboxedReturn() {
dispatchToConcrete(&Traits::ConcreteTarget::emitSandboxedReturn); dispatchToConcrete(&Traits::ConcreteTarget::emitSandboxedReturn);
} }
/// Forwards to the concrete target's emitStackProbe through
/// dispatchToConcrete, passing along the stack size in bytes.
void emitStackProbe(size_t StackSizeBytes) {
// NOTE(review): std::move on a size_t looks redundant, but it is presumably
// needed so the argument binds to dispatchToConcrete's parameter pack --
// confirm against dispatchToConcrete's signature before removing it.
dispatchToConcrete(&Traits::ConcreteTarget::emitStackProbe,
std::move(StackSizeBytes));
}
/// Emit just the call instruction (without argument or return variable /// Emit just the call instruction (without argument or return variable
/// processing), sandboxing if needed. /// processing), sandboxing if needed.
virtual Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) = 0; virtual Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) = 0;
......
...@@ -1199,6 +1199,8 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { ...@@ -1199,6 +1199,8 @@ void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
if (SpillAreaSizeBytes) { if (SpillAreaSizeBytes) {
emitStackProbe(SpillAreaSizeBytes);
// Generate "sub stackptr, SpillAreaSizeBytes" // Generate "sub stackptr, SpillAreaSizeBytes"
_sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes)); _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment