Commit 0fc611f1 by Ben Clayton

Reactor: Add Gather and Scatter instructions.

Use these as a fast path for Load() and Store(). This is an attempt to
fix the severe performance hit we incurred with robustness.

Bug: b/131224163
Change-Id: I3e244bed5ed723cf29538ff022781c813caaa5eb
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/29454
Tested-by: Ben Clayton <bclayton@google.com>
Presubmit-Ready: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
parent 9e4bc1ba
@@ -239,25 +239,31 @@ namespace sw
 	T Load(Pointer ptr, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
 	{
 		using EL = typename Element<T>::type;
-		T out;
 		auto offsets = ptr.offsets();
 		mask &= CmpLT(offsets + SIMD::Int(sizeof(float) - 1), SIMD::Int(ptr.limit));  // Disable OOB reads.
+		if (!atomic && order == std::memory_order_relaxed)
+		{
+			return rr::Gather(rr::Pointer<EL>(ptr.base), offsets, mask, sizeof(float));
+		}
+		else
+		{
+			T out;
 		auto anyLanesDisabled = AnyFalse(mask);
 		If(ptr.hasEqualOffsets() && !anyLanesDisabled)
 		{
 			// Load one, replicate.
 			auto offset = Extract(offsets, 0);
-			out = T(Load(rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order));
+			out = T(rr::Load(rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order));
 		}
 		Else If(ptr.hasSequentialOffsets() && !anyLanesDisabled)
 		{
 			// Load all elements in a single SIMD instruction.
 			auto offset = Extract(offsets, 0);
-			out = Load(rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
+			out = rr::Load(rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
 		}
 		Else
 		{
-			// Divergent offsets or masked lanes - load each element individually.
+			// Divergent offsets or masked lanes.
 			out = T(0);
 			for (int i = 0; i < SIMD::Width; i++)
 			{
@@ -271,6 +277,7 @@ namespace sw
 		}
 		return out;
 	}
+	}

 	template<typename T>
 	void Store(Pointer ptr, T val, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
@@ -278,6 +285,12 @@ namespace sw
 		using EL = typename Element<T>::type;
 		auto offsets = ptr.offsets();
 		mask &= CmpLT(offsets + SIMD::Int(sizeof(float) - 1), SIMD::Int(ptr.limit));  // Disable OOB writes.
+		if (!atomic && order == std::memory_order_relaxed)
+		{
+			return rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, sizeof(float));
+		}
+		else
+		{
 		auto anyLanesDisabled = AnyFalse(mask);
 		If(ptr.hasSequentialOffsets() && !anyLanesDisabled)
 		{
@@ -298,6 +311,7 @@ namespace sw
 			}
 		}
 	}
+	}

 } // namespace SIMD
...
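For reference, the robustness mask computed by Load() and Store() above can be modeled per lane in scalar C++. A minimal sketch (illustrative only; laneMask and the example values are not part of the change):

// Scalar model of the OOB mask above: a 4-byte access at byte offset `offset`
// stays in bounds iff its last byte (offset + sizeof(float) - 1) lies below
// `limit`. CmpLT yields ~0 (all bits set) for in-bounds lanes and 0 otherwise.
#include <cstdint>

inline int32_t laneMask(int32_t offset, int32_t limit)
{
    return (offset + int32_t(sizeof(float)) - 1 < limit) ? ~0 : 0;
}

// Example: limit = 12 and offsets = {0, 4, 8, 12} give mask = {~0, ~0, ~0, 0};
// lane 3 would touch bytes 12..15, past the 12-byte limit, so it is disabled.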
@@ -882,7 +882,17 @@ namespace rr
 #error "unknown architecture"
 #endif

-		llvm::SmallVector<std::string, 1> mattrs;
+		llvm::SmallVector<std::string, 8> mattrs;
+
+		llvm::StringMap<bool> features;
+		bool ok = llvm::sys::getHostCPUFeatures(features);
+		ASSERT_MSG(ok, "llvm::sys::getHostCPUFeatures returned false");
+		for (auto &feature : features)
+		{
+			if (feature.second) { mattrs.push_back(feature.first()); }
+		}
+
+#if 0
 #if defined(__i386__) || defined(__x86_64__)
 		mattrs.push_back(CPUID::supportsMMX() ? "+mmx" : "-mmx");
 		mattrs.push_back(CPUID::supportsCMOV() ? "+cmov" : "-cmov");
@@ -899,6 +909,7 @@ namespace rr
 		// might fail to link.
 #endif
 #endif
+#endif

 		llvm::TargetOptions targetOpts;
 		targetOpts.UnsafeFPMath = false;
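The feature detection added above relies on llvm::sys::getHostCPUFeatures(). A minimal standalone sketch of what it reports (assumes an LLVM installation of the same era; the program is illustrative, not part of the change):

// Prints each CPU feature the host supports, i.e. the names the mattrs loop
// above pushes. getHostCPUFeatures() returns false on platforms where
// detection is unsupported, which the ASSERT_MSG above guards against.
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/Host.h"
#include <cstdio>

int main()
{
    llvm::StringMap<bool> features;
    if (llvm::sys::getHostCPUFeatures(features))
    {
        for (auto &f : features)
        {
            if (f.second) { std::printf("%s\n", f.first().str().c_str()); }
        }
    }
    return 0;
}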
@@ -1299,6 +1310,55 @@ namespace rr
 		}
 	}

+	Value *Nucleus::createGather(Value *base, Type *elTy, Value *offsets, Value *mask, unsigned int alignment)
+	{
+		ASSERT(V(base)->getType()->isPointerTy());
+		ASSERT(V(offsets)->getType()->isVectorTy());
+		ASSERT(V(mask)->getType()->isVectorTy());
+
+		auto numEls = V(mask)->getType()->getVectorNumElements();
+		auto i1Ty = ::llvm::Type::getInt1Ty(*::context);
+		auto i32Ty = ::llvm::Type::getInt32Ty(*::context);
+		auto i8Ty = ::llvm::Type::getInt8Ty(*::context);
+		auto i8PtrTy = i8Ty->getPointerTo();
+		auto elPtrTy = T(elTy)->getPointerTo();
+		auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
+		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
+		auto i8Base = ::builder->CreatePointerCast(V(base), i8PtrTy);
+		auto i8Ptrs = ::builder->CreateGEP(i8Base, V(offsets));
+		auto elPtrs = ::builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
+		auto i8Mask = ::builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+		auto passthrough = ::llvm::Constant::getNullValue(elVecTy);
+		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+		auto func = ::llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
+		return V(::builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough }));
+	}
+
+	void Nucleus::createScatter(Value *base, Value *val, Value *offsets, Value *mask, unsigned int alignment)
+	{
+		ASSERT(V(base)->getType()->isPointerTy());
+		ASSERT(V(val)->getType()->isVectorTy());
+		ASSERT(V(offsets)->getType()->isVectorTy());
+		ASSERT(V(mask)->getType()->isVectorTy());
+
+		auto numEls = V(mask)->getType()->getVectorNumElements();
+		auto i1Ty = ::llvm::Type::getInt1Ty(*::context);
+		auto i32Ty = ::llvm::Type::getInt32Ty(*::context);
+		auto i8Ty = ::llvm::Type::getInt8Ty(*::context);
+		auto i8PtrTy = i8Ty->getPointerTo();
+		auto elVecTy = V(val)->getType();
+		auto elTy = elVecTy->getVectorElementType();
+		auto elPtrTy = elTy->getPointerTo();
+		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
+		auto i8Base = ::builder->CreatePointerCast(V(base), i8PtrTy);
+		auto i8Ptrs = ::builder->CreateGEP(i8Base, V(offsets));
+		auto elPtrs = ::builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
+		auto i8Mask = ::builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+		auto func = ::llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
+		::builder->CreateCall(func, { V(val), elPtrs, align, i8Mask });
+	}
+
 	Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
 	{
 		RR_DEBUG_INFO_UPDATE_LOC();
...
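createGather() and createScatter() above lower to LLVM's llvm.masked.gather and llvm.masked.scatter intrinsics. A scalar model of the semantics being requested (a sketch, not the code LLVM emits; gatherModel/scatterModel are hypothetical names):

// Inactive gather lanes produce the passthrough value, which createGather()
// sets to zero via Constant::getNullValue. The mask is truncated to i1 per
// lane (the CreateIntCast above), so only the low bit of each lane matters.
#include <cstdint>

template<typename EL, int N>
void gatherModel(EL out[N], const uint8_t *base, const int32_t offsets[N], const int32_t mask[N])
{
    for (int i = 0; i < N; i++)
    {
        out[i] = (mask[i] & 1) ? *reinterpret_cast<const EL *>(base + offsets[i]) : EL(0);
    }
}

template<typename EL, int N>
void scatterModel(uint8_t *base, const EL val[N], const int32_t offsets[N], const int32_t mask[N])
{
    for (int i = 0; i < N; i++)
    {
        if (mask[i] & 1) { *reinterpret_cast<EL *>(base + offsets[i]) = val[i]; }
    }
}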
@@ -100,6 +100,10 @@ namespace rr
 		static Value *createStore(Value *value, Value *ptr, Type *type, bool isVolatile = false, unsigned int alignment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
 		static Value *createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex);

+		// Scatter / Gather instructions
+		static Value *createGather(Value *base, Type *elementType, Value *offsets, Value *mask, unsigned int alignment);
+		static void createScatter(Value *base, Value *value, Value *offsets, Value *mask, unsigned int alignment);
+
 		// Atomic instructions
 		static Value *createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
 		static Value *createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
...
@@ -4212,4 +4212,25 @@ namespace rr
 		Nucleus::createCondBr(cmp.value, bodyBB, endBB);
 		Nucleus::setInsertBlock(bodyBB);
 	}
+
+	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		return RValue<Float4>(Nucleus::createGather(base.value, Float::getType(), offsets.value, mask.value, alignment));
+	}
+
+	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		return RValue<Int4>(Nucleus::createGather(base.value, Int::getType(), offsets.value, mask.value, alignment));
+	}
+
+	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		Nucleus::createScatter(base.value, val.value, offsets.value, mask.value, alignment);
+	}
+
+	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		Nucleus::createScatter(base.value, val.value, offsets.value, mask.value, alignment);
+	}
 }
@@ -2355,6 +2355,12 @@ namespace rr
 		return Load(RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
 	}

+	// TODO: Use SIMD to template these.
+	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+
 	template<typename T>
 	void Store(RValue<T> value, RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
 	{
...
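A hypothetical end-to-end use of the new API (a sketch: the Function<> scaffolding is the standard Reactor pattern, and the offsets/mask values are purely illustrative):

Function<Void(Pointer<Float>, Pointer<Float>)> function;
{
    Pointer<Float> src = function.Arg<0>();
    Pointer<Float> dst = function.Arg<1>();

    Int4 offsets(0, 4, 8, 12);  // Byte offsets, one per SIMD lane.
    Int4 mask(~0, ~0, 0, ~0);   // Lane 2 inactive: gathers the zero passthrough, stores nothing.

    Float4 v = Gather(src, offsets, mask, sizeof(float));
    Scatter(dst, v, offsets, mask, sizeof(float));
    Return();
}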
@@ -3453,6 +3453,8 @@ namespace rr
 	// Below are functions currently unimplemented for the Subzero backend.
 	// They are stubbed to satisfy the linker.
+	Value *Nucleus::createGather(Value *base, Type *elTy, Value *offsets, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createGather()"); return nullptr; }
+	void Nucleus::createScatter(Value *base, Value *val, Value *offsets, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createScatter()"); }
 	RValue<Float4> Sin(RValue<Float4> x) { UNIMPLEMENTED("Subzero Sin()"); return Float4(0); }
 	RValue<Float4> Cos(RValue<Float4> x) { UNIMPLEMENTED("Subzero Cos()"); return Float4(0); }
 	RValue<Float4> Tan(RValue<Float4> x) { UNIMPLEMENTED("Subzero Tan()"); return Float4(0); }
...