diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl
new file mode 100644
index 0000000000..b252b34379
--- /dev/null
+++ b/include/nbl/builtin/hlsl/concepts.hlsl
@@ -0,0 +1,89 @@
+// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/type_traits.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
+
+
+#if (__cplusplus >= 202002L && __cpp_concepts)
+
+#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__>
+#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__)
+#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ };
+#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__;
+#define NBL_REQUIRES(...) requires __VA_ARGS__
+
+#include <concepts>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+
+// Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use
+// the macros here.
+template <typename T, typename U>
+concept same_as = std::same_as<T, U>;
+
+template <typename D, typename B>
+concept derived_from = std::derived_from<D, B>;
+
+template <typename F, typename T>
+concept convertible_to = std::convertible_to<F, T>;
+
+template <typename L, typename R>
+concept assignable_from = std::assignable_from<L, R>;
+
+template <typename T, typename U>
+concept common_with = std::common_with<T, U>;
+
+template <typename T>
+concept integral = std::integral<T>;
+
+template <typename T>
+concept signed_integral = std::signed_integral<T>;
+
+template <typename T>
+concept unsigned_integral = std::unsigned_integral<T>;
+
+template <typename T>
+concept floating_point = std::floating_point<T>;
+
+
+// Some other useful concepts.
+
+template <typename T, typename... Ts>
+concept any_of = (same_as<T, Ts> || ...);
+
+template <typename T>
+concept scalar = floating_point<T> || integral<T>;
+
+template <typename T>
+concept vectorial = is_vector<T>::value;
+
+template <typename T>
+concept matricial = is_matrix<T>::value;
+
+}
+}
+}
+
+#else
+
+// No C++20 support. Do nothing.
+#define NBL_CONCEPT_TYPE_PARAMS(...)
+#define NBL_CONCEPT_SIGNATURE(NAME, ...)
+#define NBL_CONCEPT_BODY(...)
+#define NBL_CONCEPT_ASSIGN(NAME, ...)
+#define NBL_REQUIRES(...)
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl
new file mode 100644
index 0000000000..5d9bce06da
--- /dev/null
+++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl
@@ -0,0 +1,167 @@
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace property_pools
+{
+
+[[vk::push_constant]] TransferDispatchInfo globals;
+
+template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2>
+struct TransferLoop
+{
+    void iteration(uint propertyId, TransferRequest transferRequest, uint64_t invocationIndex)
+    {
+        const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2;
+        const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2;
+
+        // Fill: Always use offset 0 on src
+        const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize;
+        const uint64_t dstOffset = invocationIndex * transferRequest.propertySize;
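+        // e.g. with Fill = true and propertySize = 8, every invocation reads the same
+        // 8 source bytes at offset 0 and writes them to dstOffset = invocationIndex*8,
+        // splatting one value across the whole destination range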
+
+        // IOTA: Use the index as the fetching offset
+        // Non IOTA: Read the address buffer ("index buffer") to select fetching offset
+        uint64_t srcAddressBufferOffset;
+        uint64_t dstAddressBufferOffset;
+
+        if (SrcIndexIota) srcAddressBufferOffset = srcOffset;
+        else
+        {
+            if (SrcIndexSizeLog2 == 0) {} // we can't read individual byte
+            else if (SrcIndexSizeLog2 == 1) srcAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint16_t));
+            else if (SrcIndexSizeLog2 == 2) srcAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t));
+            else if (SrcIndexSizeLog2 == 3) srcAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint64_t));
+        }
+
+        if (DstIndexIota) dstAddressBufferOffset = dstOffset;
+        else
+        {
+            if (DstIndexSizeLog2 == 0) {} // we can't read individual byte
+            else if (DstIndexSizeLog2 == 1) dstAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint16_t));
+            else if (DstIndexSizeLog2 == 2) dstAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t));
+            else if (DstIndexSizeLog2 == 3) dstAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint64_t));
+        }
+
+        const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
+        const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;
+
+        vk::RawBufferStore<uint64_t>(dstAddressMapped, vk::RawBufferLoad<uint64_t>(srcAddressMapped));
+    }
+
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        uint64_t elementCount = uint64_t(transferRequest.elementCount32)
+            | uint64_t(transferRequest.elementCountExtra) << 32;
+        uint64_t lastInvocation = min(elementCount, dispatchInfo.endOffset);
+        for (uint64_t invocationIndex = dispatchInfo.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
+        {
+            iteration(propertyId, transferRequest, invocationIndex);
+        }
+    }
+};
+
+// For creating permutations of the functions based on parameters that are constant over the transfer request
+// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any
+// branching within them
+//
+// Permutations:
+// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size)
+// Total amount of permutations: 128
+
+template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2>
+struct TransferLoopPermutationSrcIndexSizeLog
+{
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        if (transferRequest.dstIndexSizeLog2 == 0) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 0> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else if (transferRequest.dstIndexSizeLog2 == 1) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 1> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else if (transferRequest.dstIndexSizeLog2 == 2) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 2> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 3> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+    }
+};
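+
+// e.g. a non-fill request whose srcIndexAddr is 0 (implicit iota) and whose dstIndexSizeLog2
+// is 2 resolves, branch by branch, into one instantiation TransferLoop<false,true,DstIota,SrcLog2,2>;
+// all the branching above runs once per request, outside the per-element loop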
+
+template<bool Fill, bool SrcIndexIota, bool DstIndexIota>
+struct TransferLoopPermutationDstIota
+{
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        if (transferRequest.srcIndexSizeLog2 == 0) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 0> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else if (transferRequest.srcIndexSizeLog2 == 1) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 1> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else if (transferRequest.srcIndexSizeLog2 == 2) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 2> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 3> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+    }
+};
+
+template<bool Fill, bool SrcIndexIota>
+struct TransferLoopPermutationSrcIota
+{
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        bool dstIota = transferRequest.dstIndexAddr == 0;
+        if (dstIota) { TransferLoopPermutationDstIota<Fill, SrcIndexIota, true> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else { TransferLoopPermutationDstIota<Fill, SrcIndexIota, false> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+    }
+};
+
+template<bool Fill>
+struct TransferLoopPermutationFill
+{
+    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
+    {
+        bool srcIota = transferRequest.srcIndexAddr == 0;
+        if (srcIota) { TransferLoopPermutationSrcIota<Fill, true> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+        else { TransferLoopPermutationSrcIota<Fill, false> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
+    }
+};
+
+// Loading transfer request from the pointer (can't use struct
+// with BDA on HLSL SPIRV)
+static TransferRequest TransferRequest::newFromAddress(const uint64_t transferCmdAddr)
+{
+    TransferRequest transferRequest;
+    transferRequest.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr,8);
+    transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t),8);
+    transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 2,8);
+    transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 3,8);
+    // Remaining elements are part of the same bitfield
+    // TODO: Do this only using raw buffer load?
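+    // Bit layout of the packed 64-bit word (mirrors the bitfields in transfer.hlsl):
+    // bits [0,32)  elementCount32      bits [32,35) elementCountExtra
+    // bits [35,59) propertySize        bit  [59,60) fill
+    // bits [60,62) srcIndexSizeLog2    bits [62,64) dstIndexSizeLog2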
+    uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 4,8);
+    transferRequest.elementCount32 = uint32_t(bitfieldType);
+    transferRequest.elementCountExtra = uint32_t(bitfieldType >> 32);
+    transferRequest.propertySize = uint32_t(bitfieldType >> (32 + 3));
+    transferRequest.fill = uint32_t(bitfieldType >> (32 + 3 + 24));
+    transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1));
+    transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2));
+
+    return transferRequest;
+}
+
+template<typename device_capabilities>
+void main(uint32_t3 dispatchId, const uint dispatchSize)
+{
+    const uint propertyId = dispatchId.y;
+    const uint invocationIndex = dispatchId.x;
+
+    uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId;
+    TransferRequest transferRequest = TransferRequest::newFromAddress(transferCmdAddr);
+
+    const bool fill = transferRequest.fill == 1;
+
+    if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize); }
+    else { TransferLoopPermutationFill<false> loop; loop.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize); }
+}
+
+}
+}
+}
+
+[numthreads(nbl::hlsl::property_pools::OptimalDispatchSize,1,1)]
+void main(uint32_t3 dispatchId : SV_DispatchThreadID)
+{
+    nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId, nbl::hlsl::property_pools::OptimalDispatchSize);
+}
diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl
new file mode 100644
index 0000000000..d83a3453c7
--- /dev/null
+++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl
@@ -0,0 +1,63 @@
+#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
+#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace property_pools
+{
+
+struct TransferRequest
+{
+    // This represents a transfer command/request
+    uint64_t srcAddr;
+    uint64_t dstAddr;
+    uint64_t srcIndexAddr; // IOTA default
+    uint64_t dstIndexAddr; // IOTA default
+    // TODO: go back to this ideal layout when things work
+    // (Getting a fatal error from DXC when using 64-bit bitfields:)
+    // fatal error: generated SPIR-V is invalid: [VUID-StandaloneSpirv-Base-04781] Expected 32-bit int type for Base operand: BitFieldInsert
+    // %58 = OpBitFieldInsert %ulong %42 %57 %uint_0 %uint_35
+    //
+    //uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers
+    //uint64_t propertySize : 24; // all the leftover bits (just use bytes now)
+    //uint64_t fill : 1;
+    //// 0=uint8, 1=uint16, 2=uint32, 3=uint64
+    //uint64_t srcIndexSizeLog2 : 2;
+    //uint64_t dstIndexSizeLog2 : 2;
+    uint32_t elementCount32; // first 32 bits
+    uint32_t elementCountExtra : 3; // last 3 bits
+    uint32_t propertySize : 24;
+    uint32_t fill : 1;
+    uint32_t srcIndexSizeLog2 : 2;
+    uint32_t dstIndexSizeLog2 : 2;
+
+    // Reads a TransferRequest from a BDA
+    static TransferRequest newFromAddress(const uint64_t address);
+};
+
+struct TransferDispatchInfo
+{
+    // BDA address (GPU pointer) into the transfer commands buffer
+    uint64_t transferCommandsAddress;
+    // Defines the range of invocations (X axis) that will be transferred over in this dispatch.
+    // May be sectioned off in the case of overflow or any other situation that doesn't allow
+    // for a full transfer
+    uint64_t beginOffset;
+    uint64_t endOffset;
+};
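+// e.g. when one submit can't cover a whole transfer, the invocation range can be split
+// across dispatches as {beginOffset=0, endOffset=N/2} followed by {beginOffset=N/2, endOffset=N};
+// TransferLoop::copyLoop additionally clamps endOffset to the request's element count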
+
+NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128;
+
+// TODO: instead use some sort of replace function for getting optimal size?
+NBL_CONSTEXPR uint32_t OptimalDispatchSize = 256;
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/core/alloc/address_allocator_traits.h b/include/nbl/core/alloc/address_allocator_traits.h
index 293dc3503e..f6c522bc53 100644
--- a/include/nbl/core/alloc/address_allocator_traits.h
+++ b/include/nbl/core/alloc/address_allocator_traits.h
@@ -53,6 +53,18 @@ namespace nbl::core
                 }
             }
 
+            static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses, const size_type* bytes,
+                                                const size_type alignment, const size_type* hint=nullptr) noexcept
+            {
+                for (uint32_t i=0; i<count; i+=maxMultiOps)
+                    impl::address_allocator_traits_base<AddressAlloc,has_func_multi_alloc_addr<AddressAlloc>::value>::multi_alloc_addr(
+                        alloc,std::min(count-i,maxMultiOps),outAddresses+i,bytes+i,alignment,hint ? (hint+i):nullptr);
+            }
+
             static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept
             {
                 for (uint32_t i=0; i<count; i+=maxMultiOps)
diff --git a/include/nbl/video/alloc/SubAllocatedDescriptorSet.h b/include/nbl/video/alloc/SubAllocatedDescriptorSet.h
new file mode 100644
--- /dev/null
+++ b/include/nbl/video/alloc/SubAllocatedDescriptorSet.h
@@ -0,0 +1,104 @@
+#ifndef _NBL_VIDEO_SUB_ALLOCATED_DESCRIPTOR_SET_H_
+#define _NBL_VIDEO_SUB_ALLOCATED_DESCRIPTOR_SET_H_
+
+#include "nbl/core/alloc/GeneralpurposeAddressAllocator.h"
+#include "nbl/core/alloc/address_allocator_traits.h"
+#include "nbl/video/IGPUDescriptorSetLayout.h"
+
+namespace nbl::video
+{
+
+// address allocator gives offsets
+// reserved allocator allocates memory to keep the address allocator state inside
+template<class AddrAllocator = core::GeneralpurposeAddressAllocator<uint32_t>, class ReservAllocator = core::allocator<uint8_t>>
+class SubAllocatedDescriptorSet : public core::IReferenceCounted
+{
+    public:
+        using AddressAllocator = AddrAllocator;
+        using ReservedAllocator = ReservAllocator;
+        using size_type = typename AddressAllocator::size_type;
+        using value_type = typename AddressAllocator::size_type;
+        static constexpr value_type invalid_value = AddressAllocator::invalid_address;
+
+        // constructors
+        template<typename... Args>
+        inline SubAllocatedDescriptorSet(const std::span<const video::IGPUDescriptorSetLayout::SBinding> bindings,
+            ReservedAllocator&& _reservedAllocator, const value_type maxAllocatableAlignment, Args&&... args)
+        {
+            auto allocatableDescriptors = 0;
+            m_allocatableRanges.reserve(bindings.size());
+
+            for (auto& binding : bindings)
+            {
+                SubAllocDescriptorSetRange range;
+                range.offset = allocatableDescriptors;
+                range.binding = binding;
+                // Only bindings with these flags will be allocatable
+                if (binding.createFlags.hasFlags(core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT)
+                    | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT
+                    | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT))
+                {
+                    allocatableDescriptors += binding.count;
+                }
+                m_allocatableRanges.push_back(range);
+            }
+
+            m_addressAllocator = AddrAllocator(
+                _reservedAllocator.allocate(AddressAllocator::reserved_size(maxAllocatableAlignment, static_cast<size_type>(allocatableDescriptors), args...), _NBL_SIMD_ALIGNMENT),
+                static_cast<size_type>(0), 0u, maxAllocatableAlignment, static_cast<size_type>(allocatableDescriptors), std::forward<Args>(args)...
+            );
+            m_reservedAllocator = ReservedAllocator(std::move(_reservedAllocator));
+            m_reservedSize = allocatableDescriptors;
+        }
+        // version with default constructed reserved allocator
+        template<typename... Args>
+        explicit inline SubAllocatedDescriptorSet(const std::span<const video::IGPUDescriptorSetLayout::SBinding> bindings,
+            const value_type maxAllocatableAlignment, Args&&... args) :
+            SubAllocatedDescriptorSet(bindings,ReservedAllocator(),maxAllocatableAlignment,std::forward<Args>(args)...)
+        {
+        }
+        ~SubAllocatedDescriptorSet()
+        {
+            auto ptr = reinterpret_cast<const uint8_t*>(core::address_allocator_traits<AddressAllocator>::getReservedSpacePtr(m_addressAllocator));
+            m_reservedAllocator.deallocate(const_cast<uint8_t*>(ptr),m_reservedSize);
+        }
+
+        // anyone gonna use it?
+        inline const AddressAllocator& getAddressAllocator() const {return m_addressAllocator;}
+
+        //
+        inline ReservedAllocator& getReservedAllocator() {return m_reservedAllocator;}
+
+        // main methods
+
+        //! Warning: `outAddresses` needs to be primed with `invalid_value` values; no allocation happens for elements not equal to `invalid_value`
+        template<typename... Args>
+        inline void multi_allocate(uint32_t count, value_type* outAddresses, const size_type* sizes, const Args&... args)
+        {
+            core::address_allocator_traits<AddressAllocator>::multi_alloc_addr(m_addressAllocator,count,outAddresses,sizes,1,args...);
+        }
+        inline void multi_deallocate(uint32_t count, const size_type* addr, const size_type* sizes)
+        {
+            core::address_allocator_traits<AddressAllocator>::multi_free_addr(m_addressAllocator,count,addr,sizes);
+        }
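+        // Hypothetical usage sketch (names assumed, not part of this header):
+        //   uint32_t offsets[3] = {invalid_value,invalid_value,invalid_value};
+        //   uint32_t sizes[3]   = {1,1,1};
+        //   subAllocDS->multi_allocate(3,offsets,sizes);   // grab 3 descriptor slots
+        //   // ... write descriptors at the returned offsets, record GPU work ...
+        //   subAllocDS->multi_deallocate(3,offsets,sizes); // only once the GPU is done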
+
+        // to conform to IBufferAllocator concept
+        template<typename... Args>
+        inline value_type allocate(const size_type bytes, const size_type alignment, const Args&... args)
+        {
+            value_type retval = invalid_value;
+            multi_allocate(1u,&retval,&bytes,args...);
+            return retval;
+        }
+        template<typename... Args>
+        inline void deallocate(value_type& allocation, Args&&... args)
+        {
+            multi_deallocate(std::forward<Args>(args)...);
+            allocation = invalid_value;
+        }
+
+    protected:
+        AddressAllocator m_addressAllocator;
+        ReservedAllocator m_reservedAllocator;
+        size_t m_reservedSize; // FIXME: uninitialized variable
+
+        struct SubAllocDescriptorSetRange {
+            uint32_t offset;
+            video::IGPUDescriptorSetLayout::SBinding binding;
+        };
+        std::vector<SubAllocDescriptorSetRange> m_allocatableRanges = {};
+};
+
+}
+
+#endif
diff --git a/include/nbl/video/utilities/CPropertyPoolHandler.h b/include/nbl/video/utilities/CPropertyPoolHandler.h
index b4423b0e0b..f1a6e6da2e 100644
--- a/include/nbl/video/utilities/CPropertyPoolHandler.h
+++ b/include/nbl/video/utilities/CPropertyPoolHandler.h
@@ -12,18 +12,13 @@
 #include "nbl/video/utilities/IDescriptorSetCache.h"
 #include "nbl/video/utilities/IPropertyPool.h"
 
+#include "glm/glm/glm.hpp"
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"
 
 namespace nbl::video
 {
 
-#if 0 // TODO: port
-#define int int32_t
-#define uint uint32_t
-#include "nbl/builtin/glsl/property_pool/transfer.glsl"
-#undef uint
-#undef int
-static_assert(NBL_BUILTIN_PROPERTY_POOL_INVALID==IPropertyPool::invalid);
-
 // property pool factory is externally synchronized
 // TODO: could rename to CSparseStreamingSystem/CSparseStreamingHandler
 class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, public core::Unmovable
@@ -37,19 +32,10 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 		//
 		inline ILogicalDevice* getDevice() {return m_device.get();}
 
-		//
-		inline const uint32_t getMaxPropertiesPerTransferDispatch() {return m_maxPropertiesPerPass;}
-
-		//
-		inline uint32_t getMaxScratchSize() const {return sizeof(nbl_glsl_property_pool_transfer_t)*m_maxPropertiesPerPass;}
-
 		//
 		inline IGPUComputePipeline* getPipeline() {return m_pipeline.get();}
 		inline const IGPUComputePipeline* getPipeline() const {return m_pipeline.get();}
 
-		//
-		inline const IGPUDescriptorSetLayout* getCanonicalLayout() const { return m_dsCache->getCanonicalLayout(); }
-
 		//
 		struct TransferRequest
 		{
 			//
 			enum E_FLAG : uint16_t
 			{
 				EF_NONE=0,
-				EF_DOWNLOAD=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_DOWNLOAD,
+				// this wasn't used anywhere in the hlsl
+				EF_DOWNLOAD=1,
 				// this flag will make the `srcAddresses ? srcAddresses[0]:0` be used as the source address for all reads, effectively "filling" with uniform value
-				EF_FILL=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_SRC_FILL,
-				EF_BIT_COUNT=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_BIT_COUNT
+				EF_FILL=2,
+				EF_BIT_COUNT=3
 			};
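+			// e.g. an EF_FILL request with elementCount=N replicates the single source
+			// element (srcAddresses[0], or offset 0 with no index buffer) into all N destinations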
 			//
 			static inline constexpr uint32_t invalid_offset = ~0u;
@@ -72,9 +59,6 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 				elementSize = pool->getPropertySize(propertyID);
 			}
 
-			//
-			inline bool isDownload() const {return flags&EF_DOWNLOAD;}
-
 			//
 			inline uint32_t getSourceElementCount() const
 			{
@@ -87,21 +71,22 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 			asset::SBufferRange<IGPUBuffer> memblock = {};
 			E_FLAG flags = EF_NONE;
 			uint16_t elementSize = 0u;
-			uint32_t elementCount = 0u;
+			uint64_t elementCount = 0u;
 			// the source or destination buffer depending on the transfer type
 			asset::SBufferBinding<IGPUBuffer> buffer = {};
 			// can be invalid, if invalid, treated like an implicit {0,1,2,3,...} iota view
-			uint32_t srcAddressesOffset = IPropertyPool::invalid;
-			uint32_t dstAddressesOffset = IPropertyPool::invalid;
+			uint64_t srcAddressesOffset = IPropertyPool::invalid;
+			uint64_t dstAddressesOffset = IPropertyPool::invalid;
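+			// e.g. leaving dstAddressesOffset at IPropertyPool::invalid makes source element i
+			// land at destination index i, as if an explicit {0,1,2,...} index buffer were bound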
 		};
 
 		// Fence must be not pending yet, `cmdbuf` must be already in recording state.
 		[[nodiscard]] bool transferProperties(
-			IGPUCommandBuffer* const cmdbuf, IGPUFence* const fence,
+			IGPUCommandBuffer* const cmdbuf, //IGPUFence* const fence,
 			const asset::SBufferBinding<video::IGPUBuffer>& scratch, const asset::SBufferBinding<video::IGPUBuffer>& addresses,
 			const TransferRequest* const requestsBegin, const TransferRequest* const requestsEnd,
 			system::logger_opt_ptr logger, const uint32_t baseDWORD=0u, const uint32_t endDWORD=~0ull
 		);
+#if 0 // TODO: Up streaming requests
 		//
 		struct UpStreamingRequest
 		{
@@ -190,7 +175,10 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 			uint32_t& waitSemaphoreCount, IGPUSemaphore* const*& semaphoresToWaitBeforeOverwrite, const asset::PIPELINE_STAGE_FLAGS*& stagesToWaitForPerSemaphore,
 			system::logger_opt_ptr logger, const std::chrono::steady_clock::time_point& maxWaitPoint=std::chrono::steady_clock::now()+std::chrono::microseconds(500u)
 		);
-
+#endif
+
+// TODO: freeing properties
+#if 0
 		// utility to help you fill out the tail move scatter request after the free, properly, returns if you actually need to transfer anything
 		static inline bool freeProperties(IPropertyPool* pool, UpStreamingRequest* requests, const uint32_t* indicesBegin, const uint32_t* indicesEnd, uint32_t* srcAddresses, uint32_t* dstAddresses)
 		{
@@ -211,34 +199,21 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
 			}
 			return false;
 		}
+#endif
 
 	protected:
 		~CPropertyPoolHandler() {}
 
-		static inline constexpr auto MaxPropertiesPerDispatch = NBL_BUILTIN_PROPERTY_POOL_MAX_PROPERTIES_PER_DISPATCH;
+		static inline constexpr auto MaxPropertiesPerDispatch = nbl::hlsl::property_pools::MaxPropertiesPerDispatch;
 		static inline constexpr auto DescriptorCacheSize = 128u;
 
 		core::smart_refctd_ptr<ILogicalDevice> m_device;
 		core::smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
-		// TODO: investigate using Push Descriptors for this
-		class TransferDescriptorSetCache : public IDescriptorSetCache
-		{
-			public:
-				using IDescriptorSetCache::IDescriptorSetCache;
-
-				//
-				uint32_t acquireSet(
-					CPropertyPoolHandler* handler, const asset::SBufferBinding<video::IGPUBuffer>& scratch, const asset::SBufferBinding<video::IGPUBuffer>& addresses,
-					const TransferRequest* requests, const uint32_t propertyCount
-				);
-		};
-		core::smart_refctd_ptr<TransferDescriptorSetCache> m_dsCache;
-		uint16_t m_maxPropertiesPerPass;
 		uint32_t m_alignment;
 };
-#endif
 
 }
-#endif
\ No newline at end of file
+
+#endif
diff --git a/include/nbl/video/utilities/IPropertyPool.h b/include/nbl/video/utilities/IPropertyPool.h
index 0f56df622e..86c4d02f47 100644
--- a/include/nbl/video/utilities/IPropertyPool.h
+++ b/include/nbl/video/utilities/IPropertyPool.h
@@ -11,6 +11,10 @@
 #include "nbl/video/ILogicalDevice.h"
 #include "nbl/video/IGPUDescriptorSetLayout.h"
 
+#include "glm/glm/glm.hpp"
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"
+
 namespace nbl::video
 {
 
@@ -21,8 +25,8 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted
 	public:
 		using PropertyAddressAllocator = core::PoolAddressAllocatorST<uint32_t>;
 
-		static inline constexpr auto invalid = PropertyAddressAllocator::invalid_address;
-
+		static inline constexpr uint64_t invalid = 0;
+		using value_type = PropertyAddressAllocator::size_type;
 
 		//
 		virtual const asset::SBufferRange<IGPUBuffer>& getPropertyMemoryBlock(uint32_t ix) const =0;
@@ -34,19 +38,19 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted
 		inline bool isContiguous() const {return m_indexToAddr;}
 
 		//
-		inline uint32_t getAllocated() const
+		inline value_type getAllocated() const
 		{
 			return indexAllocator.get_allocated_size();
 		}
 
 		//
-		inline uint32_t getFree() const
+		inline value_type getFree() const
 		{
 			return indexAllocator.get_free_size();
 		}
 
 		//
-		inline uint32_t getCapacity() const
+		inline value_type getCapacity() const
 		{
 			// special case allows us to use `get_total_size`, because the pool allocator has no added offsets
 			return indexAllocator.get_total_size();
@@ -217,8 +221,8 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted
 		static bool validateBlocks(const ILogicalDevice* device, const uint32_t propertyCount, const size_t* propertySizes, const uint32_t capacity, const asset::SBufferRange<IGPUBuffer>* _memoryBlocks);
 
 		PropertyAddressAllocator indexAllocator;
-		uint32_t* m_indexToAddr;
-		uint32_t* m_addrToIndex;
+		uint64_t* m_indexToAddr;
+		uint64_t* m_addrToIndex;
 };
 
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 64789e7697..40f248303e 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -255,6 +255,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_shuffle.hlsl")
 #stdlib
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/algorithm.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bit.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/functional.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/limits.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/type_traits.hlsl")
@@ -299,4 +300,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl")
 
-ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL")
\ No newline at end of file
+# property pools
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/property_pool/transfer.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/property_pool/copy.comp.hlsl")
+
+ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL")
diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp
index 40f17e4e75..1dcc82540e 100644
--- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp
+++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp
@@ -5,11 +5,36 @@
 using namespace nbl;
 using namespace video;
 
-#if 0 // TODO: port
 //
-CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr<ILogicalDevice>&& device) : m_device(std::move(device)), m_dsCache()
+CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr<ILogicalDevice>&& device) : m_device(std::move(device))
 {
-	// TODO: rewrite in HLSL!
+	auto system = m_device->getPhysicalDevice()->getSystem();
+	// TODO: Reuse asset manager from elsewhere?
+	auto assetManager = core::make_smart_refctd_ptr<asset::IAssetManager>(core::smart_refctd_ptr<system::ISystem>(system));
+
+	auto loadShader = [&](const char* path)
+	{
+		asset::IAssetLoader::SAssetLoadParams params = {};
+		auto assetBundle = assetManager->getAsset(path, params);
+		auto assets = assetBundle.getContents();
+		assert(!assets.empty());
+
+		auto cpuShader = asset::IAsset::castDown<asset::ICPUShader>(assets[0]);
+		auto shader = m_device->createShader(cpuShader.get());
+		return shader;
+	};
+	auto shader = loadShader("../../../include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl");
+	const asset::SPushConstantRange transferInfoPushConstants = { asset::IShader::ESS_COMPUTE,0u,sizeof(nbl::hlsl::property_pools::TransferDispatchInfo) };
+	auto layout = m_device->createPipelineLayout({ &transferInfoPushConstants,1u });
+
+	{
+		video::IGPUComputePipeline::SCreationParams params = {};
+		params.layout = layout.get();
+		params.shader.shader = shader.get();
+
+		m_device->createComputePipelines(nullptr, { &params, 1 }, &m_pipeline);
+	}
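+	// Note: the pipeline layout deliberately has no descriptor sets; the copy shader
+	// reaches all buffers through BDA pointers carried in the TransferDispatchInfo push constants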
+
+	#if 0
 	const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
 	m_maxPropertiesPerPass = core::min((deviceLimits.maxPerStageDescriptorSSBOs-2u)/2u,MaxPropertiesPerDispatch);
@@ -59,15 +84,122 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr<ILogicalDevice>&& device)
 	#endif
 }
 
 bool CPropertyPoolHandler::transferProperties(
-	IGPUCommandBuffer* const cmdbuf, IGPUFence* const fence,
+	IGPUCommandBuffer* const cmdbuf, //IGPUFence* const fence,
 	const asset::SBufferBinding<video::IGPUBuffer>& scratch, const asset::SBufferBinding<video::IGPUBuffer>& addresses,
 	const TransferRequest* const requestsBegin, const TransferRequest* const requestsEnd,
-	system::logger_opt_ptr logger, const uint32_t baseDWORD, const uint32_t endDWORD
+	system::logger_opt_ptr logger, const uint32_t baseOffsetBytes, const uint32_t endOffsetBytes
 )
 {
-	assert(false); // TODO: Atil
+	if (requestsBegin==requestsEnd)
+		return true;
+	if (!scratch.buffer || !scratch.buffer->getCreationParams().usage.hasFlags(IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF))
+	{
+		logger.log("CPropertyPoolHandler: Need a valid scratch buffer which can have updates staged from the commandbuffer!",system::ILogger::ELL_ERROR);
+		return false;
+	}
+	// TODO: validate usage flags
+	uint32_t maxScratchSize = MaxPropertiesPerDispatch * sizeof(nbl::hlsl::property_pools::TransferRequest);
+	if (scratch.offset + maxScratchSize > scratch.buffer->getSize())
+		logger.log("CPropertyPoolHandler: The scratch buffer binding provided might not be big enough in the worst case! (Scratch buffer size: %i Max scratch size: %i)",
+			system::ILogger::ELL_WARNING,
+			scratch.buffer->getSize() - scratch.offset,
+			maxScratchSize);
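+	// Worst case: MaxPropertiesPerDispatch (128) GPU-side TransferRequests of 40 bytes
+	// each (4 uint64_t addresses + one packed 64-bit word), i.e. 128 * 40 = 5120 bytes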
+
+	const auto totalProps = std::distance(requestsBegin,requestsEnd);
+	bool success = true;
+
+	uint32_t numberOfPasses = totalProps / MaxPropertiesPerDispatch;
+	nbl::hlsl::property_pools::TransferRequest transferRequestsData[MaxPropertiesPerDispatch];
+	uint64_t scratchBufferDeviceAddr = scratch.buffer.get()->getDeviceAddress() + scratch.offset;
+	uint64_t addressBufferDeviceAddr = addresses.buffer.get()->getDeviceAddress() + addresses.offset;
+
+	for (uint32_t transferPassRequestsIndex = 0; transferPassRequestsIndex < totalProps; transferPassRequestsIndex += MaxPropertiesPerDispatch)
+	{
+		const TransferRequest* transferPassRequests = requestsBegin + transferPassRequestsIndex;
+		uint32_t requestsThisPass = core::min<uint32_t>(std::distance(transferPassRequests, requestsEnd), MaxPropertiesPerDispatch);
+		uint64_t maxElements = 0;
+		for (uint32_t i = 0; i < requestsThisPass; i++)
+		{
+			auto& transferRequest = transferRequestsData[i];
+			auto srcRequest = transferPassRequests + i;
+			transferRequest.srcAddr = srcRequest->memblock.buffer.get()->getDeviceAddress() + srcRequest->memblock.offset;
+			transferRequest.dstAddr = srcRequest->buffer.buffer.get()->getDeviceAddress() + srcRequest->buffer.offset;
+			transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset != IPropertyPool::invalid ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0;
+			transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset != IPropertyPool::invalid ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0;
+			transferRequest.elementCount32 = uint32_t(srcRequest->elementCount & ((uint64_t(1) << 32) - 1));
+			transferRequest.elementCountExtra = uint32_t(srcRequest->elementCount >> 32);
+			transferRequest.propertySize = srcRequest->elementSize;
+			transferRequest.fill = 0; // TODO
+			transferRequest.srcIndexSizeLog2 = 1u; // TODO
+			transferRequest.dstIndexSizeLog2 = 1u; // TODO
+			if (getAlignment(transferRequest.srcAddr) != 0)
+			{
+				logger.log("CPropertyPoolHandler: memblock.buffer BDA address %I64i is not aligned to 8 bytes (64 bit)",system::ILogger::ELL_ERROR, transferRequest.srcAddr);
+			}
+			if (getAlignment(transferRequest.dstAddr) != 0)
+			{
+				logger.log("CPropertyPoolHandler: buffer.buffer BDA address %I64i is not aligned to 8 bytes (64 bit)",system::ILogger::ELL_ERROR, transferRequest.dstAddr);
+			}
+			if (getAlignment(transferRequest.propertySize) != 0)
+			{
+				logger.log("CPropertyPoolHandler: propertySize %i is not aligned to 8 bytes (64 bit)",system::ILogger::ELL_ERROR, srcRequest->elementSize);
+			}
+			if (transferRequest.srcIndexSizeLog2 < 1 || transferRequest.srcIndexSizeLog2 > 3)
+			{
+				auto srcIndexSizeLog2 = transferRequest.srcIndexSizeLog2;
+				logger.log("CPropertyPoolHandler: srcIndexSizeLog2 %i (%i bit values) is unsupported",system::ILogger::ELL_ERROR, srcIndexSizeLog2, 8u << srcIndexSizeLog2);
+			}
+			if (transferRequest.dstIndexSizeLog2 < 1 || transferRequest.dstIndexSizeLog2 > 3)
+			{
+				auto dstIndexSizeLog2 = transferRequest.dstIndexSizeLog2;
+				logger.log("CPropertyPoolHandler: dstIndexSizeLog2 %i (%i bit values) is unsupported",system::ILogger::ELL_ERROR, dstIndexSizeLog2, 8u << dstIndexSizeLog2);
+			}
+
+			maxElements = core::max(maxElements, srcRequest->elementCount);
+		}
+		cmdbuf->updateBuffer({ scratch.offset,sizeof(nbl::hlsl::property_pools::TransferRequest) * requestsThisPass, core::smart_refctd_ptr(scratch.buffer) }, transferRequestsData);
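+		// The inline update above is a transfer-stage write; it has to be made visible
+		// to the compute stage before the dispatch reads the requests through BDA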
+
+		const asset::SMemoryBarrier barriers[1] = { {
+			.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
+			.srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+			.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+			.dstAccessMask = asset::ACCESS_FLAGS::SHADER_READ_BITS
+		} };
+		cmdbuf->pipelineBarrier(asset::EDF_NONE,IGPUCommandBuffer::SPipelineBarrierDependencyInfo{
+			.memBarriers = barriers
+			// TODO: .bufBarriers = instead
+		});
+
+		cmdbuf->bindComputePipeline(m_pipeline.get());
+
+		nbl::hlsl::property_pools::TransferDispatchInfo pushConstants;
+		{
+			// TODO: Should the offset bytes be handled elsewhere?
+			pushConstants.beginOffset = baseOffsetBytes;
+			pushConstants.endOffset = endOffsetBytes;
+			pushConstants.transferCommandsAddress = scratchBufferDeviceAddr + transferPassRequestsIndex * sizeof(nbl::hlsl::property_pools::TransferRequest);
+		}
+		assert(getAlignment(scratchBufferDeviceAddr) == 0);
+		assert(getAlignment(sizeof(nbl::hlsl::property_pools::TransferRequest)) == 0);
+		cmdbuf->pushConstants(m_pipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0u, sizeof(pushConstants), &pushConstants);
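+		// Dispatch shape: Y indexes the request/property, while X is a persistent-workgroup
+		// grid derived from ceil(maxElements/requestsThisPass); the shader's copyLoop then
+		// strides by the total dispatch size until the request's element range is covered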
+
+		// dispatch
+		{
+			const auto& limits = m_device->getPhysicalDevice()->getLimits();
+			const auto invocationCoarseness = limits.maxOptimallyResidentWorkgroupInvocations * requestsThisPass;
+			const auto dispatchElements = (maxElements - 1) / requestsThisPass + 1;
+			cmdbuf->dispatch(limits.computeOptimalPersistentWorkgroupDispatchSize(dispatchElements,invocationCoarseness), requestsThisPass, 1u);
+		}
+		// TODO: pipeline barrier
+	}
+
+	return success;
 #if 0
 	if (requestsBegin==requestsEnd)
 		return true;
@@ -186,6 +318,8 @@ bool CPropertyPoolHandler::transferProperties(
 #endif
 }
 
+#if 0 // TODO: up streaming requests
+
 uint32_t CPropertyPoolHandler::transferProperties(
 	StreamingTransientDataBufferMT<>* const upBuff, IGPUCommandBuffer* const cmdbuf, IGPUFence* const fence, IQueue* const queue,
 	const asset::SBufferBinding<video::IGPUBuffer>& scratch, UpStreamingRequest* &requests, const uint32_t requestCount,
@@ -534,69 +668,5 @@ uint32_t CPropertyPoolHandler::transferProperties(
 	return 0u;
 }
 
-uint32_t CPropertyPoolHandler::TransferDescriptorSetCache::acquireSet(
-	CPropertyPoolHandler* handler, const asset::SBufferBinding<video::IGPUBuffer>& scratch, const asset::SBufferBinding<video::IGPUBuffer>& addresses,
-	const TransferRequest* requests, const uint32_t propertyCount
-)
-{
-	auto retval = IDescriptorSetCache::acquireSet();
-	if (retval==IDescriptorSetCache::invalid_index)
-		return IDescriptorSetCache::invalid_index;
-
-
-	auto device = handler->getDevice();
-	const auto maxPropertiesPerPass = handler->getMaxPropertiesPerTransferDispatch();
-
-
-	IGPUDescriptorSet::SDescriptorInfo infos[MaxPropertiesPerDispatch*2u+2u];
-	infos[0] = scratch;
-	infos[0].info.buffer.size = sizeof(nbl_glsl_property_pool_transfer_t)*propertyCount;
-	infos[1] = addresses;
-	auto* inDescInfo = infos+2;
-	auto* outDescInfo = infos+2+maxPropertiesPerPass;
-	for (uint32_t i=0u; i<propertyCount; i++)
-	{
-		const auto& request = requests[i];
-		if (request.isDownload())
-		{
-			inDescInfo[i] = request.memblock;
-			outDescInfo[i] = request.buffer;
-		}
-		else
-		{
-			inDescInfo[i] = request.buffer;
-			outDescInfo[i] = request.memblock;
-		}
-	}
-
-	IGPUDescriptorSet* const set = IDescriptorSetCache::getSet(retval);
-	IGPUDescriptorSet::SWriteDescriptorSet writes[4u];
-	for (auto i=0u; i<4u; i++)
-	{
-		writes[i].dstSet = set;
-		writes[i].binding = i;
-		writes[i].arrayElement = 0u;
-		writes[i].count = i<2u ? 1u:maxPropertiesPerPass;
-	}
-	writes[0].info = infos;
-	writes[1].info = infos+1u;
-	writes[2].info = inDescInfo;
-	writes[3].info = outDescInfo;
-	device->updateDescriptorSets(4u, writes, 0u, nullptr);
-
-	return retval;
-}
-#endif
\ No newline at end of file
diff --git a/src/nbl/video/utilities/IPropertyPool.cpp b/src/nbl/video/utilities/IPropertyPool.cpp
index 2aec9387f8..683954ee55 100644
--- a/src/nbl/video/utilities/IPropertyPool.cpp
+++ b/src/nbl/video/utilities/IPropertyPool.cpp
@@ -20,7 +20,7 @@ IPropertyPool::IPropertyPool(uint32_t capacity, void* reserved, bool contiguous)
 {
 	if (contiguous)
 	{
-		m_indexToAddr = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(reserved)+getReservedSize(capacity));
+		m_indexToAddr = reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(reserved)+getReservedSize(capacity));
 		m_addrToIndex = m_indexToAddr+capacity;
 
 		std::fill_n(m_indexToAddr,capacity,invalid);