diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl
new file mode 100644
index 0000000000..b252b34379
--- /dev/null
+++ b/include/nbl/builtin/hlsl/concepts.hlsl
@@ -0,0 +1,89 @@
+// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/type_traits.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
+
+
+#if (__cplusplus >= 202002L && __cpp_concepts)
+
+#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__>
+#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__)
+#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ };
+#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__;
+#define NBL_REQUIRES(...) requires __VA_ARGS__
+
+#include <concepts>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+
+// Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use
+// the macros here.
+template <typename T, typename U>
+concept same_as = std::same_as<T, U>;
+
+template <typename D, typename B>
+concept derived_from = std::derived_from<D, B>;
+
+template <typename F, typename T>
+concept convertible_to = std::convertible_to<F, T>;
+
+template <typename L, typename R>
+concept assignable_from = std::assignable_from<L, R>;
+
+template <typename T, typename U>
+concept common_with = std::common_with<T, U>;
+
+template <typename T>
+concept integral = std::integral<T>;
+
+template <typename T>
+concept signed_integral = std::signed_integral<T>;
+
+template <typename T>
+concept unsigned_integral = std::unsigned_integral<T>;
+
+template <typename T>
+concept floating_point = std::floating_point<T>;
+
+
+// Some other useful concepts.
+
+template <typename T, typename... Ts>
+concept any_of = (same_as<T, Ts> || ...);
+
+template <typename T>
+concept scalar = floating_point<T> || integral<T>;
+
+template <typename T>
+concept vectorial = is_vector<T>::value;
+
+template <typename T>
+concept matricial = is_matrix<T>::value;
+
+}
+}
+}
+
+#else
+
+// No C++20 support. Do nothing.
+#define NBL_CONCEPT_TYPE_PARAMS(...)
+#define NBL_CONCEPT_SIGNATURE(NAME, ...)
+#define NBL_CONCEPT_BODY(...)
+#define NBL_CONCEPT_ASSIGN(NAME, ...)
+#define NBL_REQUIRES(...)
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl
new file mode 100644
index 0000000000..5d9bce06da
--- /dev/null
+++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl
@@ -0,0 +1,167 @@
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace property_pools
+{
+
+[[vk::push_constant]] TransferDispatchInfo globals;
+
+template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2>
+struct TransferLoop
+{
+    void iteration(uint propertyId, TransferRequest transferRequest, uint64_t invocationIndex)
+    {
+        const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2;
+        const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2;
+
+        // Fill: Always use offset 0 on src
+        const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize;
+        const uint64_t dstOffset = invocationIndex * transferRequest.propertySize;
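+        // e.g. with Fill = true and propertySize = 8, every invocation reads the same
+        // 8 source bytes at offset 0 and writes them to dstOffset = invocationIndex*8,
+        // splatting one value across the whole destination range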
+
+        // IOTA: Use the index as the fetching offset
+        // Non IOTA: Read the address buffer ("index buffer") to select fetching offset
+        uint64_t srcAddressBufferOffset;
+        uint64_t dstAddressBufferOffset;
+
+        if (SrcIndexIota) srcAddressBufferOffset = srcOffset;
+        else
+        {
+            if (SrcIndexSizeLog2 == 0) {} // we can't read individual byte
+            else if (SrcIndexSizeLog2 == 1) srcAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint16_t));
+            else if (SrcIndexSizeLog2 == 2) srcAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t));
+            else if (SrcIndexSizeLog2 == 3) srcAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint64_t));
+        }
+
+        if (DstIndexIota) dstAddressBufferOffset = dstOffset;
+        else
+        {
+            if (DstIndexSizeLog2 == 0) {} // we can't read individual byte
+            else if (DstIndexSizeLog2 == 1) dstAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint16_t));
+            else if (DstIndexSizeLog2 == 2) dstAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t));
+            else if (DstIndexSizeLog2 == 3) dstAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint64_t));
+        }
+
+        const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
+        const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;
+
+        vk::RawBufferStore<uint64_t>(dstAddressMapped, vk::RawBufferLoad<uint64_t>(srcAddressMapped));
+    }
+
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        uint64_t elementCount = uint64_t(transferRequest.elementCount32)
+            | uint64_t(transferRequest.elementCountExtra) << 32;
+        uint64_t lastInvocation = min(elementCount, dispatchInfo.endOffset);
+        for (uint64_t invocationIndex = dispatchInfo.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
+        {
+            iteration(propertyId, transferRequest, invocationIndex);
+        }
+    }
+};
+
+// For creating permutations of the functions based on parameters that are constant over the transfer request
+// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any
+// branching within them
+//
+// Permutations:
+// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size)
+// Total amount of permutations: 128
+
+template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2>
+struct TransferLoopPermutationSrcIndexSizeLog
+{
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        if (transferRequest.dstIndexSizeLog2 == 0) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 0> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else if (transferRequest.dstIndexSizeLog2 == 1) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 1> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else if (transferRequest.dstIndexSizeLog2 == 2) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 2> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 3> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+    }
+};
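+
+// e.g. a non-fill request whose srcIndexAddr is 0 (implicit iota) and whose dstIndexSizeLog2
+// is 2 resolves, branch by branch, into one instantiation TransferLoop<false,true,DstIota,SrcLog2,2>;
+// all the branching above runs once per request, outside the per-element loop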
+
+template<bool Fill, bool SrcIndexIota, bool DstIndexIota>
+struct TransferLoopPermutationDstIota
+{
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        if (transferRequest.srcIndexSizeLog2 == 0) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 0> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else if (transferRequest.srcIndexSizeLog2 == 1) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 1> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else if (transferRequest.srcIndexSizeLog2 == 2) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 2> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 3> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+    }
+};
+
+template<bool Fill, bool SrcIndexIota>
+struct TransferLoopPermutationSrcIota
+{
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        bool dstIota = transferRequest.dstIndexAddr == 0;
+        if (dstIota) { TransferLoopPermutationDstIota<Fill, SrcIndexIota, true> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else { TransferLoopPermutationDstIota<Fill, SrcIndexIota, false> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+    }
+};
+
+template<bool Fill>
+struct TransferLoopPermutationFill
+{
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        bool srcIota = transferRequest.srcIndexAddr == 0;
+        if (srcIota) { TransferLoopPermutationSrcIota<Fill, true> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else { TransferLoopPermutationSrcIota<Fill, false> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+    }
+};
+
+// Loading transfer request from the pointer (can't use struct
+// with BDA on HLSL SPIRV)
+static TransferRequest TransferRequest::newFromAddress(const uint64_t transferCmdAddr)
+{
+    TransferRequest transferRequest;
+    transferRequest.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr,8);
+    transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t),8);
+    transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 2,8);
+    transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 3,8);
+    // Remaining elements are part of the same bitfield
+    // TODO: Do this only using raw buffer load?
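+    // Bit layout of the packed 64-bit word (mirrors the bitfields in transfer.hlsl):
+    // bits [0,32)  elementCount32      bits [32,35) elementCountExtra
+    // bits [35,59) propertySize        bit  [59,60) fill
+    // bits [60,62) srcIndexSizeLog2    bits [62,64) dstIndexSizeLog2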
+    uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 4,8);
+    transferRequest.elementCount32 = uint32_t(bitfieldType);
+    transferRequest.elementCountExtra = uint32_t(bitfieldType >> 32);
+    transferRequest.propertySize = uint32_t(bitfieldType >> (32 + 3));
+    transferRequest.fill = uint32_t(bitfieldType >> (32 + 3 + 24));
+    transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1));
+    transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2));
+
+    return transferRequest;
+}
+
+template<typename device_capabilities>
+void main(uint32_t3 dispatchId, const uint dispatchSize)
+{
+    const uint propertyId = dispatchId.y;
+    const uint invocationIndex = dispatchId.x;
+
+    uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId;
+    TransferRequest transferRequest = TransferRequest::newFromAddress(transferCmdAddr);
+
+    const bool fill = transferRequest.fill == 1;
+
+    if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize); }
+    else { TransferLoopPermutationFill<false> loop; loop.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize); }
+}
+
+}
+}
+}
+
+[numthreads(nbl::hlsl::property_pools::OptimalDispatchSize,1,1)]
+void main(uint32_t3 dispatchId : SV_DispatchThreadID)
+{
+    nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId, nbl::hlsl::property_pools::OptimalDispatchSize);
+}
diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl
new file mode 100644
index 0000000000..d83a3453c7
--- /dev/null
+++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl
@@ -0,0 +1,63 @@
+#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
+#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace property_pools
+{
+
+struct TransferRequest
+{
+    // This represents a transfer command/request
+    uint64_t srcAddr;
+    uint64_t dstAddr;
+    uint64_t srcIndexAddr; // IOTA default
+    uint64_t dstIndexAddr; // IOTA default
+    // TODO: go back to this ideal layout when things work
+    // (Getting a fatal error from DXC when using 64-bit bitfields:)
+    // fatal error: generated SPIR-V is invalid: [VUID-StandaloneSpirv-Base-04781] Expected 32-bit int type for Base operand: BitFieldInsert
+    // %58 = OpBitFieldInsert %ulong %42 %57 %uint_0 %uint_35
+    //
+    //uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers
+    //uint64_t propertySize : 24; // all the leftover bits (just use bytes now)
+    //uint64_t fill : 1;
+    //// 0=uint8, 1=uint16, 2=uint32, 3=uint64
+    //uint64_t srcIndexSizeLog2 : 2;
+    //uint64_t dstIndexSizeLog2 : 2;
+    uint32_t elementCount32; // first 32 bits
+    uint32_t elementCountExtra : 3; // last 3 bits
+    uint32_t propertySize : 24;
+    uint32_t fill : 1;
+    uint32_t srcIndexSizeLog2 : 2;
+    uint32_t dstIndexSizeLog2 : 2;
+
+    // Reads a TransferRequest from a BDA
+    static TransferRequest newFromAddress(const uint64_t address);
+};
+
+struct TransferDispatchInfo
+{
+    // BDA address (GPU pointer) into the transfer commands buffer
+    uint64_t transferCommandsAddress;
+    // Defines the range of invocations (X axis) that will be transferred over in this dispatch.
+    // May be sectioned off in the case of overflow or any other situation that doesn't allow
+    // for a full transfer
+    uint64_t beginOffset;
+    uint64_t endOffset;
+};
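+// e.g. when one submit can't cover a whole transfer, the invocation range can be split
+// across dispatches as {beginOffset=0, endOffset=N/2} followed by {beginOffset=N/2, endOffset=N};
+// TransferLoop::copyLoop additionally clamps endOffset to the request's element count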
+
+NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128;
+
+// TODO: instead use some sort of replace function for getting optimal size?
+NBL_CONSTEXPR uint32_t OptimalDispatchSize = 256;
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/core/alloc/address_allocator_traits.h b/include/nbl/core/alloc/address_allocator_traits.h
index 293dc3503e..f6c522bc53 100644
--- a/include/nbl/core/alloc/address_allocator_traits.h
+++ b/include/nbl/core/alloc/address_allocator_traits.h
@@ -53,6 +53,18 @@ namespace nbl::core
                 }
             }
 
+            static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses, const size_type* bytes,
+                                                const size_type alignment, const size_type* hint=nullptr) noexcept
+            {
+                for (uint32_t i=0; i<count; i+=maxMultiOps)
+                    impl::address_allocator_traits_base<AddressAlloc,has_func_multi_alloc_addr<AddressAlloc>::value>::multi_alloc_addr(
+                        alloc,std::min(count-i,maxMultiOps),outAddresses+i,bytes+i,alignment,hint ? (hint+i):nullptr);
+            }
+
             static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept
             {
                 for (uint32_t i=0; i<count; i+=maxMultiOps)
diff --git a/include/nbl/video/alloc/SubAllocatedDescriptorSet.h b/include/nbl/video/alloc/SubAllocatedDescriptorSet.h
new file mode 100644
--- /dev/null
+++ b/include/nbl/video/alloc/SubAllocatedDescriptorSet.h
@@ -0,0 +1,104 @@
+#ifndef _NBL_VIDEO_SUB_ALLOCATED_DESCRIPTOR_SET_H_
+#define _NBL_VIDEO_SUB_ALLOCATED_DESCRIPTOR_SET_H_
+
+#include "nbl/core/alloc/GeneralpurposeAddressAllocator.h"
+#include "nbl/core/alloc/address_allocator_traits.h"
+#include "nbl/video/IGPUDescriptorSetLayout.h"
+
+namespace nbl::video
+{
+
+// address allocator gives offsets
+// reserved allocator allocates memory to keep the address allocator state inside
+template<class AddrAllocator = core::GeneralpurposeAddressAllocator<uint32_t>, class ReservAllocator = core::allocator<uint8_t>>
+class SubAllocatedDescriptorSet : public core::IReferenceCounted
+{
+    public:
+        using AddressAllocator = AddrAllocator;
+        using ReservedAllocator = ReservAllocator;
+        using size_type = typename AddressAllocator::size_type;
+        using value_type = typename AddressAllocator::size_type;
+        static constexpr value_type invalid_value = AddressAllocator::invalid_address;
+
+        // constructors
+        template<typename... Args>
+        inline SubAllocatedDescriptorSet(const std::span<const video::IGPUDescriptorSetLayout::SBinding> bindings,
+            ReservedAllocator&& _reservedAllocator, const value_type maxAllocatableAlignment, Args&&... args)
+        {
+            auto allocatableDescriptors = 0;
+            m_allocatableRanges.reserve(bindings.size());
+
+            for (auto& binding : bindings)
+            {
+                SubAllocDescriptorSetRange range;
+                range.offset = allocatableDescriptors;
+                range.binding = binding;
+                // Only bindings with these flags will be allocatable
+                if (binding.createFlags.hasFlags(core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT)
+                    | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT
+                    | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT))
+                {
+                    allocatableDescriptors += binding.count;
+                }
+                m_allocatableRanges.push_back(range);
+            }
+
+            m_addressAllocator = AddrAllocator(
+                _reservedAllocator.allocate(AddressAllocator::reserved_size(maxAllocatableAlignment, static_cast<size_type>(allocatableDescriptors), args...), _NBL_SIMD_ALIGNMENT),
+                static_cast<size_type>(0), 0u, maxAllocatableAlignment, static_cast<size_type>(allocatableDescriptors), std::forward<Args>(args)...
+            );
+            m_reservedAllocator = ReservedAllocator(std::move(_reservedAllocator));
+            m_reservedSize = allocatableDescriptors;
+        }
+        // version with default constructed reserved allocator
+        template<typename... Args>
+        explicit inline SubAllocatedDescriptorSet(const std::span<const video::IGPUDescriptorSetLayout::SBinding> bindings,
+            const value_type maxAllocatableAlignment, Args&&... args) :
+            SubAllocatedDescriptorSet(bindings,ReservedAllocator(),maxAllocatableAlignment,std::forward<Args>(args)...)
+        {
+        }
+        ~SubAllocatedDescriptorSet()
+        {
+            auto ptr = reinterpret_cast<const uint8_t*>(core::address_allocator_traits<AddressAllocator>::getReservedSpacePtr(m_addressAllocator));
+            m_reservedAllocator.deallocate(const_cast<uint8_t*>(ptr),m_reservedSize);
+        }
+
+        // anyone gonna use it?
+        inline const AddressAllocator& getAddressAllocator() const {return m_addressAllocator;}
+
+        //
+        inline ReservedAllocator& getReservedAllocator() {return m_reservedAllocator;}
+
+        // main methods
+
+        //! Warning: `outAddresses` needs to be primed with `invalid_value` values; no allocation happens for elements not equal to `invalid_value`
+        template<typename... Args>
+        inline void multi_allocate(uint32_t count, value_type* outAddresses, const size_type* sizes, const Args&... args)
+        {
+            core::address_allocator_traits<AddressAllocator>::multi_alloc_addr(m_addressAllocator,count,outAddresses,sizes,1,args...);
+        }
+        inline void multi_deallocate(uint32_t count, const size_type* addr, const size_type* sizes)
+        {
+            core::address_allocator_traits<AddressAllocator>::multi_free_addr(m_addressAllocator,count,addr,sizes);
+        }
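+        // Hypothetical usage sketch (names assumed, not part of this header):
+        //   uint32_t offsets[3] = {invalid_value,invalid_value,invalid_value};
+        //   uint32_t sizes[3]   = {1,1,1};
+        //   subAllocDS->multi_allocate(3,offsets,sizes);   // grab 3 descriptor slots
+        //   // ... write descriptors at the returned offsets, record GPU work ...
+        //   subAllocDS->multi_deallocate(3,offsets,sizes); // only once the GPU is done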
+
+        // to conform to IBufferAllocator concept
+        template<typename... Args>
+        inline value_type allocate(const size_type bytes, const size_type alignment, const Args&... args)
+        {
+            value_type retval = invalid_value;
+            multi_allocate(1u,&retval,&bytes,args...);
+            return retval;
+        }
+        template<typename... Args>
+        inline void deallocate(value_type& allocation, Args&&... args)
+        {
+            multi_deallocate(std::forward<Args>(args)...);
+            allocation = invalid_value;
+        }
+
+    protected:
+        AddressAllocator m_addressAllocator;
+        ReservedAllocator m_reservedAllocator;
+        size_t m_reservedSize; // FIXME: uninitialized variable
+
+        struct SubAllocDescriptorSetRange {
+            uint32_t offset;
+            video::IGPUDescriptorSetLayout::SBinding binding;
+        };
+        std::vector<SubAllocDescriptorSetRange> m_allocatableRanges = {};
+};
+
+}
+
+#endif
diff --git a/include/nbl/video/utilities/CPropertyPoolHandler.h b/include/nbl/video/utilities/CPropertyPoolHandler.h
index b4423b0e0b..f1a6e6da2e 100644
--- a/include/nbl/video/utilities/CPropertyPoolHandler.h
+++ b/include/nbl/video/utilities/CPropertyPoolHandler.h
@@ -12,18 +12,13 @@
 #include "nbl/video/utilities/IDescriptorSetCache.h"
 #include "nbl/video/utilities/IPropertyPool.h"
 
+#include "glm/glm/glm.hpp"
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"
 
 namespace nbl::video
 {
 
-#if 0 // TODO: port
-#define int int32_t
-#define uint uint32_t
-#include "nbl/builtin/glsl/property_pool/transfer.glsl"
-#undef uint
-#undef int
-static_assert(NBL_BUILTIN_PROPERTY_POOL_INVALID==IPropertyPool::invalid);
-
 // property pool factory is externally synchronized
 // TODO: could rename to CSparseStreamingSystem/CSparseStreamingHandler
 class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, public core::Unmovable
@@ -37,19 +32,10 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 		//
 		inline ILogicalDevice* getDevice() {return m_device.get();}
 
-		//
-		inline const uint32_t getMaxPropertiesPerTransferDispatch() {return m_maxPropertiesPerPass;}
-
-		//
-		inline uint32_t getMaxScratchSize() const {return sizeof(nbl_glsl_property_pool_transfer_t)*m_maxPropertiesPerPass;}
-
 		//
 		inline IGPUComputePipeline* getPipeline() {return m_pipeline.get();}
 		inline const IGPUComputePipeline* getPipeline() const {return m_pipeline.get();}
 
-		//
-		inline const IGPUDescriptorSetLayout* getCanonicalLayout() const { return m_dsCache->getCanonicalLayout(); }
-
 		//
 		struct TransferRequest
 		{
 			//
 			enum E_FLAG : uint16_t
 			{
 				EF_NONE=0,
-				EF_DOWNLOAD=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_DOWNLOAD,
+				// this wasn't used anywhere in the hlsl
+				EF_DOWNLOAD=1,
 				// this flag will make the `srcAddresses ? srcAddresses[0]:0` be used as the source address for all reads, effectively "filling" with uniform value
-				EF_FILL=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_SRC_FILL,
-				EF_BIT_COUNT=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_BIT_COUNT
+				EF_FILL=2,
+				EF_BIT_COUNT=3
 			};
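+			// e.g. an EF_FILL request with elementCount=N replicates the single source
+			// element (srcAddresses[0], or offset 0 with no index buffer) into all N destinations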
 			//
 			static inline constexpr uint32_t invalid_offset = ~0u;
@@ -72,9 +59,6 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 				elementSize = pool->getPropertySize(propertyID);
 			}
 
-			//
-			inline bool isDownload() const {return flags&EF_DOWNLOAD;}
-
 			//
 			inline uint32_t getSourceElementCount() const
 			{
@@ -87,21 +71,22 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 			asset::SBufferRange<IGPUBuffer> memblock = {};
 			E_FLAG flags = EF_NONE;
 			uint16_t elementSize = 0u;
-			uint32_t elementCount = 0u;
+			uint64_t elementCount = 0u;
 			// the source or destination buffer depending on the transfer type
 			asset::SBufferBinding<IGPUBuffer> buffer = {};
 			// can be invalid, if invalid, treated like an implicit {0,1,2,3,...} iota view
-			uint32_t srcAddressesOffset = IPropertyPool::invalid;
-			uint32_t dstAddressesOffset = IPropertyPool::invalid;
+			uint64_t srcAddressesOffset = IPropertyPool::invalid;
+			uint64_t dstAddressesOffset = IPropertyPool::invalid;
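+			// e.g. leaving dstAddressesOffset at IPropertyPool::invalid makes source element i
+			// land at destination index i, as if an explicit {0,1,2,...} index buffer were bound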
 		};
 
 		// Fence must be not pending yet, `cmdbuf` must be already in recording state.
 		[[nodiscard]] bool transferProperties(
-			IGPUCommandBuffer* const cmdbuf, IGPUFence* const fence,
+			IGPUCommandBuffer* const cmdbuf, //IGPUFence* const fence,
 			const asset::SBufferBinding<video::IGPUBuffer>& scratch, const asset::SBufferBinding<video::IGPUBuffer>& addresses,
 			const TransferRequest* const requestsBegin, const TransferRequest* const requestsEnd,
 			system::logger_opt_ptr logger, const uint32_t baseDWORD=0u, const uint32_t endDWORD=~0ull
 		);
+#if 0 // TODO: Up streaming requests
 		//
 		struct UpStreamingRequest
 		{
@@ -190,7 +175,10 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 			uint32_t& waitSemaphoreCount, IGPUSemaphore* const*& semaphoresToWaitBeforeOverwrite, const asset::PIPELINE_STAGE_FLAGS*& stagesToWaitForPerSemaphore,
 			system::logger_opt_ptr logger, const std::chrono::steady_clock::time_point& maxWaitPoint=std::chrono::steady_clock::now()+std::chrono::microseconds(500u)
 		);
-
+#endif
+
+// TODO: freeing properties
+#if 0
 		// utility to help you fill out the tail move scatter request after the free, properly, returns if you actually need to transfer anything
 		static inline bool freeProperties(IPropertyPool* pool, UpStreamingRequest* requests, const uint32_t* indicesBegin, const uint32_t* indicesEnd, uint32_t* srcAddresses, uint32_t* dstAddresses)
 		{
@@ -211,34 +199,21 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 			}
 			return false;
 		}
+#endif
 
 	protected:
 		~CPropertyPoolHandler() {}
 
-		static inline constexpr auto MaxPropertiesPerDispatch = NBL_BUILTIN_PROPERTY_POOL_MAX_PROPERTIES_PER_DISPATCH;
+		static inline constexpr auto MaxPropertiesPerDispatch = nbl::hlsl::property_pools::MaxPropertiesPerDispatch;
 		static inline constexpr auto DescriptorCacheSize = 128u;
 
 		core::smart_refctd_ptr<ILogicalDevice> m_device;
 		core::smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
-		// TODO: investigate using Push Descriptors for this
-		class TransferDescriptorSetCache : public IDescriptorSetCache
-		{
-			public:
-				using IDescriptorSetCache::IDescriptorSetCache;
-
-				//
-				uint32_t acquireSet(
-					CPropertyPoolHandler* handler, const asset::SBufferBinding<video::IGPUBuffer>& scratch, const asset::SBufferBinding<video::IGPUBuffer>& addresses,
-					const TransferRequest* requests, const uint32_t propertyCount
-				);
-		};
-		core::smart_refctd_ptr<TransferDescriptorSetCache> m_dsCache;
-		uint16_t m_maxPropertiesPerPass;
 		uint32_t m_alignment;
 };
-#endif
 
 }
-#endif
\ No newline at end of file
+
+#endif
diff --git a/include/nbl/video/utilities/IPropertyPool.h b/include/nbl/video/utilities/IPropertyPool.h
index 0f56df622e..86c4d02f47 100644
--- a/include/nbl/video/utilities/IPropertyPool.h
+++ b/include/nbl/video/utilities/IPropertyPool.h
@@ -11,6 +11,10 @@
 #include "nbl/video/ILogicalDevice.h"
 #include "nbl/video/IGPUDescriptorSetLayout.h"
 
+#include "glm/glm/glm.hpp"
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"
+
 namespace nbl::video
 {
 
@@ -21,8 +25,8 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted
 	public:
 		using PropertyAddressAllocator = core::PoolAddressAllocatorST<uint32_t>;
 
-		static inline constexpr auto invalid = PropertyAddressAllocator::invalid_address;
-
+		static inline constexpr uint64_t invalid = 0;
+		using value_type = PropertyAddressAllocator::size_type;
 
 		//
 		virtual const asset::SBufferRange<IGPUBuffer>& getPropertyMemoryBlock(uint32_t ix) const =0;
@@ -34,19 +38,19 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted
 		inline bool isContiguous() const {return m_indexToAddr;}
 
 		//
-		inline uint32_t getAllocated() const
+		inline value_type getAllocated() const
 		{
 			return indexAllocator.get_allocated_size();
 		}
 
 		//
-		inline uint32_t getFree() const
+		inline value_type getFree() const
 		{
 			return indexAllocator.get_free_size();
 		}
 
 		//
-		inline uint32_t getCapacity() const
+		inline value_type getCapacity() const
 		{
 			// special case allows us to use `get_total_size`, because the pool allocator has no added offsets
 			return indexAllocator.get_total_size();
@@ -217,8 +221,8 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted
 		static bool validateBlocks(const ILogicalDevice* device, const uint32_t propertyCount, const size_t* propertySizes, const uint32_t capacity, const asset::SBufferRange<IGPUBuffer>* _memoryBlocks);
 
 		PropertyAddressAllocator indexAllocator;
-		uint32_t* m_indexToAddr;
-		uint32_t* m_addrToIndex;
+		uint64_t* m_indexToAddr;
+		uint64_t* m_addrToIndex;
 };
 
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 64789e7697..40f248303e 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -255,6 +255,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_shuffle.hlsl")
 #stdlib
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/algorithm.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bit.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/functional.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/limits.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/type_traits.hlsl")
@@ -299,4 +300,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl")
 
-ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL")
\ No newline at end of file
+# property pools
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/property_pool/transfer.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/property_pool/copy.comp.hlsl")
+
+ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL")
diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp
index 40f17e4e75..1dcc82540e 100644
--- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp
+++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp
@@ -5,11 +5,36 @@
 using namespace nbl;
 using namespace video;
 
-#if 0 // TODO: port
 //
-CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr<ILogicalDevice>&& device) : m_device(std::move(device)), m_dsCache()
+CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr<ILogicalDevice>&& device) : m_device(std::move(device))
 {
-	// TODO: rewrite in HLSL!
+	auto system = m_device->getPhysicalDevice()->getSystem();
+	// TODO: Reuse asset manager from elsewhere?
+	auto assetManager = core::make_smart_refctd_ptr<asset::IAssetManager>(core::smart_refctd_ptr<system::ISystem>(system));
+
+	auto loadShader = [&](const char* path)
+	{
+		asset::IAssetLoader::SAssetLoadParams params = {};
+		auto assetBundle = assetManager->getAsset(path, params);
+		auto assets = assetBundle.getContents();
+		assert(!assets.empty());
+
+		auto cpuShader = asset::IAsset::castDown<asset::ICPUShader>(assets[0]);
+		auto shader = m_device->createShader(cpuShader.get());
+		return shader;
+	};
+	auto shader = loadShader("../../../include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl");
+	const asset::SPushConstantRange transferInfoPushConstants = { asset::IShader::ESS_COMPUTE,0u,sizeof(nbl::hlsl::property_pools::TransferDispatchInfo) };
+	auto layout = m_device->createPipelineLayout({ &transferInfoPushConstants,1u });
+
+	{
+		video::IGPUComputePipeline::SCreationParams params = {};
+		params.layout = layout.get();
+		params.shader.shader = shader.get();
+
+		m_device->createComputePipelines(nullptr, { &params, 1 }, &m_pipeline);
+	}
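+	// Note: the pipeline layout deliberately has no descriptor sets; the copy shader
+	// reaches all buffers through BDA pointers carried in the TransferDispatchInfo push constants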
+
+	#if 0
 	const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
 	m_maxPropertiesPerPass = core::min((deviceLimits.maxPerStageDescriptorSSBOs-2u)/2u,MaxPropertiesPerDispatch);
@@ -59,15 +84,122 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr<ILogicalDevice>&& device)
 	#endif
 }
 
 bool CPropertyPoolHandler::transferProperties(
-	IGPUCommandBuffer* const cmdbuf, IGPUFence* const fence,
+	IGPUCommandBuffer* const cmdbuf, //IGPUFence* const fence,
 	const asset::SBufferBinding<video::IGPUBuffer>& scratch, const asset::SBufferBinding<video::IGPUBuffer>& addresses,
 	const TransferRequest* const requestsBegin, const TransferRequest* const requestsEnd,
-	system::logger_opt_ptr logger, const uint32_t baseDWORD, const uint32_t endDWORD
+	system::logger_opt_ptr logger, const uint32_t baseOffsetBytes, const uint32_t endOffsetBytes
 )
 {
-	assert(false); // TODO: Atil
+	if (requestsBegin==requestsEnd)
+		return true;
+	if (!scratch.buffer || !scratch.buffer->getCreationParams().usage.hasFlags(IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF))
+	{
+		logger.log("CPropertyPoolHandler: Need a valid scratch buffer which can have updates staged from the commandbuffer!",system::ILogger::ELL_ERROR);
+		return false;
+	}
+	// TODO: validate usage flags
+	uint32_t maxScratchSize = MaxPropertiesPerDispatch * sizeof(nbl::hlsl::property_pools::TransferRequest);
+	if (scratch.offset + maxScratchSize > scratch.buffer->getSize())
+		logger.log("CPropertyPoolHandler: The scratch buffer binding provided might not be big enough in the worst case! (Scratch buffer size: %i Max scratch size: %i)",
+			system::ILogger::ELL_WARNING,
+			scratch.buffer->getSize() - scratch.offset,
+			maxScratchSize);
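+	// Worst case: MaxPropertiesPerDispatch (128) GPU-side TransferRequests of 40 bytes
+	// each (4 uint64_t addresses + one packed 64-bit word), i.e. 128 * 40 = 5120 bytes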
+
+	const auto totalProps = std::distance(requestsBegin,requestsEnd);
+	bool success = true;
+
+	uint32_t numberOfPasses = totalProps / MaxPropertiesPerDispatch;
+	nbl::hlsl::property_pools::TransferRequest transferRequestsData[MaxPropertiesPerDispatch];
+	uint64_t scratchBufferDeviceAddr = scratch.buffer.get()->getDeviceAddress() + scratch.offset;
+	uint64_t addressBufferDeviceAddr = addresses.buffer.get()->getDeviceAddress() + addresses.offset;
+
+	for (uint32_t transferPassRequestsIndex = 0; transferPassRequestsIndex < totalProps; transferPassRequestsIndex += MaxPropertiesPerDispatch)
+	{
+		const TransferRequest* transferPassRequests = requestsBegin + transferPassRequestsIndex;
+		uint32_t requestsThisPass = core::min<uint32_t>(std::distance(transferPassRequests, requestsEnd), MaxPropertiesPerDispatch);
+		uint64_t maxElements = 0;
+		for (uint32_t i = 0; i < requestsThisPass; i++)
+		{
+			auto& transferRequest = transferRequestsData[i];
+			auto srcRequest = transferPassRequests + i;
+			transferRequest.srcAddr = srcRequest->memblock.buffer.get()->getDeviceAddress() + srcRequest->memblock.offset;
+			transferRequest.dstAddr = srcRequest->buffer.buffer.get()->getDeviceAddress() + srcRequest->buffer.offset;
+			transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset != IPropertyPool::invalid ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0;
+			transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset != IPropertyPool::invalid ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0;
+			transferRequest.elementCount32 = uint32_t(srcRequest->elementCount & ((uint64_t(1) << 32) - 1));
+			transferRequest.elementCountExtra = uint32_t(srcRequest->elementCount >> 32);
+			transferRequest.propertySize = srcRequest->elementSize;
+			transferRequest.fill = 0; // TODO
+			transferRequest.srcIndexSizeLog2 = 1u; // TODO
+			transferRequest.dstIndexSizeLog2 = 1u; // TODO
+			if (getAlignment(transferRequest.srcAddr) != 0)
+			{
+				logger.log("CPropertyPoolHandler: memblock.buffer BDA address %I64i is not aligned to 8 bytes (64 bit)",system::ILogger::ELL_ERROR, transferRequest.srcAddr);
+			}
+			if (getAlignment(transferRequest.dstAddr) != 0)
+			{
+				logger.log("CPropertyPoolHandler: buffer.buffer BDA address %I64i is not aligned to 8 bytes (64 bit)",system::ILogger::ELL_ERROR, transferRequest.dstAddr);
+			}
+			if (getAlignment(transferRequest.propertySize) != 0)
+			{
+				logger.log("CPropertyPoolHandler: propertySize %i is not aligned to 8 bytes (64 bit)",system::ILogger::ELL_ERROR, srcRequest->elementSize);
+			}
+			if (transferRequest.srcIndexSizeLog2 < 1 || transferRequest.srcIndexSizeLog2 > 3)
+			{
+				auto srcIndexSizeLog2 = transferRequest.srcIndexSizeLog2;
+				logger.log("CPropertyPoolHandler: srcIndexSizeLog2 %i (%i bit values) is unsupported",system::ILogger::ELL_ERROR, srcIndexSizeLog2, 8u << srcIndexSizeLog2);
+			}
+			if (transferRequest.dstIndexSizeLog2 < 1 || transferRequest.dstIndexSizeLog2 > 3)
+			{
+				auto dstIndexSizeLog2 = transferRequest.dstIndexSizeLog2;
+				logger.log("CPropertyPoolHandler: dstIndexSizeLog2 %i (%i bit values) is unsupported",system::ILogger::ELL_ERROR, dstIndexSizeLog2, 8u << dstIndexSizeLog2);
+			}
+
+			maxElements = core::max(maxElements, srcRequest->elementCount);
+		}
+		cmdbuf->updateBuffer({ scratch.offset,sizeof(nbl::hlsl::property_pools::TransferRequest) * requestsThisPass, core::smart_refctd_ptr(scratch.buffer) }, transferRequestsData);
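+		// The inline update above is a transfer-stage write; it has to be made visible
+		// to the compute stage before the dispatch reads the requests through BDA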
+
+		const asset::SMemoryBarrier barriers[1] = { {
+			.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+			.srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+			.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+			.dstAccessMask = asset::ACCESS_FLAGS::SHADER_READ_BITS
+		} };
+		cmdbuf->pipelineBarrier(asset::EDF_NONE,IGPUCommandBuffer::SPipelineBarrierDependencyInfo{
+			.memBarriers = barriers
+			// TODO: .bufBarriers = instead
+		});
+
+		cmdbuf->bindComputePipeline(m_pipeline.get());
+
+		nbl::hlsl::property_pools::TransferDispatchInfo pushConstants;
+		{
+			// TODO: Should the offset bytes be handled elsewhere?
+			pushConstants.beginOffset = baseOffsetBytes;
+			pushConstants.endOffset = endOffsetBytes;
+			pushConstants.transferCommandsAddress = scratchBufferDeviceAddr + transferPassRequestsIndex * sizeof(nbl::hlsl::property_pools::TransferRequest);
+		}
+		assert(getAlignment(scratchBufferDeviceAddr) == 0);
+		assert(getAlignment(sizeof(nbl::hlsl::property_pools::TransferRequest)) == 0);
+		cmdbuf->pushConstants(m_pipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0u, sizeof(pushConstants), &pushConstants);
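+		// Dispatch shape: Y indexes the request/property, while X is a persistent-workgroup
+		// grid derived from ceil(maxElements/requestsThisPass); the shader's copyLoop then
+		// strides by the total dispatch size until the request's element range is covered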
+
+		// dispatch
+		{
+			const auto& limits = m_device->getPhysicalDevice()->getLimits();
+			const auto invocationCoarseness = limits.maxOptimallyResidentWorkgroupInvocations * requestsThisPass;
+			const auto dispatchElements = (maxElements - 1) / requestsThisPass + 1;
+			cmdbuf->dispatch(limits.computeOptimalPersistentWorkgroupDispatchSize(dispatchElements,invocationCoarseness), requestsThisPass, 1u);
+		}
+		// TODO: pipeline barrier
+	}
+
+	return success;
 #if 0
 	if (requestsBegin==requestsEnd)
 		return true;
@@ -186,6 +318,8 @@ bool CPropertyPoolHandler::transferProperties(
 #endif
 }
 
+#if 0 // TODO: up streaming requests
+
 uint32_t CPropertyPoolHandler::transferProperties(
 	StreamingTransientDataBufferMT<>* const upBuff, IGPUCommandBuffer* const cmdbuf, IGPUFence* const fence, IQueue* const queue,
 	const asset::SBufferBinding<video::IGPUBuffer>& scratch, UpStreamingRequest* &requests, const uint32_t requestCount,
@@ -534,69 +668,5 @@ uint32_t CPropertyPoolHandler::transferProperties(
 	return 0u;
 }
 
-uint32_t CPropertyPoolHandler::TransferDescriptorSetCache::acquireSet(
-	CPropertyPoolHandler* handler, const asset::SBufferBinding<video::IGPUBuffer>& scratch, const asset::SBufferBinding<video::IGPUBuffer>& addresses,
-	const TransferRequest* requests, const uint32_t propertyCount
-)
-{
-	auto retval = IDescriptorSetCache::acquireSet();
-	if (retval==IDescriptorSetCache::invalid_index)
-		return IDescriptorSetCache::invalid_index;
-
-
-	auto device = handler->getDevice();
-	const auto maxPropertiesPerPass = handler->getMaxPropertiesPerTransferDispatch();
-
-
-	IGPUDescriptorSet::SDescriptorInfo infos[MaxPropertiesPerDispatch*2u+2u];
-	infos[0] = scratch;
-	infos[0].info.buffer.size = sizeof(nbl_glsl_property_pool_transfer_t)*propertyCount;
-	infos[1] = addresses;
-	auto* inDescInfo = infos+2;
-	auto* outDescInfo = infos+2+maxPropertiesPerPass;
-	for (uint32_t i=0u; i<propertyCount; i++)
-	{
-		const auto& request = requests[i];
-		if (request.isDownload())
-		{
-			inDescInfo[i] = request.memblock;
-			outDescInfo[i] = request.buffer;
-		}
-		else
-		{
-			inDescInfo[i] = request.buffer;
-			outDescInfo[i] = request.memblock;
-		}
-	}
-
-	IGPUDescriptorSet* const set = IDescriptorSetCache::getSet(retval);
-	IGPUDescriptorSet::SWriteDescriptorSet writes[4u];
-	for (auto i=0u; i<4u; i++)
-	{
-		writes[i].dstSet = set;
-		writes[i].binding = i;
-		writes[i].arrayElement = 0u;
-		writes[i].count = i<2u ? 1u:maxPropertiesPerPass;
-	}
-	writes[0].info = infos;
-	writes[1].info = infos+1u;
-	writes[2].info = inDescInfo;
-	writes[3].info = outDescInfo;
-	device->updateDescriptorSets(4u, writes, 0u, nullptr);
-
-	return retval;
-}
-#endif
\ No newline at end of file
diff --git a/src/nbl/video/utilities/IPropertyPool.cpp b/src/nbl/video/utilities/IPropertyPool.cpp
index 2aec9387f8..683954ee55 100644
--- a/src/nbl/video/utilities/IPropertyPool.cpp
+++ b/src/nbl/video/utilities/IPropertyPool.cpp
@@ -20,7 +20,7 @@ IPropertyPool::IPropertyPool(uint32_t capacity, void* reserved, bool contiguous)
 {
 	if (contiguous)
 	{
-		m_indexToAddr = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(reserved)+getReservedSize(capacity));
+		m_indexToAddr = reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(reserved)+getReservedSize(capacity));
 		m_addrToIndex = m_indexToAddr+capacity;
 
 		std::fill_n(m_indexToAddr,capacity,invalid);