diff --git a/26_CentralLimitBoxBlur/CMakeLists.txt b/26_CentralLimitBoxBlur/CMakeLists.txt
new file mode 100644
index 000000000..bd3146859
--- /dev/null
+++ b/26_CentralLimitBoxBlur/CMakeLists.txt
@@ -0,0 +1,19 @@
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+	file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+	foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+		LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+	endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl
new file mode 100644
index 000000000..a2226fa45
--- /dev/null
+++ b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl
@@ -0,0 +1,50 @@
+#include "nbl/builtin/hlsl/blur/common.hlsl"
+
+[[vk::binding( 0, 0 )]] Texture2D<float32_t4> input;
+[[vk::binding( 1, 0 )]] RWTexture2D<float32_t4> output;
+
+
+// TODO: figure out the proper way to do templated BufferAccessors
+struct BufferAccessor
+{
+	uint32_t2 chosenAxis;
+
+	nbl::hlsl::float32_t get( const uint32_t linearIndex, const uint32_t channel )
+	{
+		uint32_t3 texSize;
+		input.GetDimensions( 0, texSize.x, texSize.y, texSize.z );
+
+		uint32_t axisSize = dot( texSize.xy, chosenAxis );
+
+		uint32_t2 coordinate = { linearIndex % axisSize, linearIndex / axisSize };
+		float32_t data = 0.f;
+		if( all( coordinate < texSize.xy ) )
+		{
+			float32_t4 pixel = input[ coordinate.xy ];
+			data = pixel[ channel ];
+		}
+
+		return data;
+	}
+
+	void set( const uint32_t linearIndex, const uint32_t channel, NBL_CONST_REF_ARG( float32_t ) val )
+	{
+		uint32_t2 texSize;
+		output.GetDimensions( texSize.x, texSize.y );
+
+		uint32_t axisSize = dot( texSize, chosenAxis );
+
+		uint32_t2 coordinate = { linearIndex % axisSize, linearIndex / axisSize };
+		if( all( coordinate < texSize ) )
+		{
+			output[ coordinate.xy ][ channel ] = val;
+		}
+	}
+};
+
+BufferAccessor BufferAccessorCtor( uint32_t2 chosenAxis )
+{
+	BufferAccessor ba;
+	ba.chosenAxis = chosenAxis;
+	return ba;
+}
diff --git a/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl
new file mode 100644
index 000000000..dbcef350e
--- /dev/null
+++ b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#pragma shader_stage(compute)
+
+#include "nbl/builtin/hlsl/blur/common.hlsl"
+#include "descriptors.hlsl"
+
+#include "nbl/builtin/hlsl/blur/box_blur.hlsl"
+
+[[vk::push_constant]]
+BoxBlurParams boxBlurParams;
+
+[numthreads( WORKGROUP_SIZE, 1, 1 )]
+void main( uint3 invocationID : SV_DispatchThreadID )
+{
+	uint32_t direction = boxBlurParams.getDirection();
+	uint32_t wrapMode = boxBlurParams.getWrapMode();
+	nbl::hlsl::float32_t4 borderColor = float32_t4( 1.f, 0.f, 1.f, 1.f );
+	if( boxBlurParams.getWrapMode() == WRAP_MODE_CLAMP_TO_BORDER )
+	{
+		borderColor = boxBlurParams.getBorderColor();
+	}
+
+	BufferAccessor textureAccessor = BufferAccessorCtor( boxBlurParams.chosenAxis );
+
+	for( uint32_t ch = 0; ch < boxBlurParams.getChannelCount(); ++ch )
+	{
+		BoxBlur( ch, boxBlurParams.radius, wrapMode, borderColor, textureAccessor );
+	}
+}
diff --git a/26_CentralLimitBoxBlur/main.cpp b/26_CentralLimitBoxBlur/main.cpp
new file mode 100644
index 000000000..f89fd09b8
--- /dev/null
+++ b/26_CentralLimitBoxBlur/main.cpp
@@ -0,0 +1,339 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+
+// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
+#include "../common/MonoDeviceApplication.hpp"
+#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+#include <nabla.h>
+
+#include "CArchive.h"
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace video;
+
+#define _NBL_PLATFORM_WINDOWS_
+
+class BoxBlurDemo final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = examples::MonoDeviceApplication;
+	using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
+
+public:
+	BoxBlurDemo(
+		const path& _localInputCWD,
+		const path& _localOutputCWD,
+		const path& _sharedInputCWD,
+		const path& _sharedOutputCWD
+	) : system::IApplicationFramework( _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD )
+	{}
+
+	bool onAppInitialized( smart_refctd_ptr<ISystem>&& system ) override
+	{
+		// Remember to call the base class initialization!
+		if( !device_base_t::onAppInitialized( std::move( system ) ) )
+		{
+			return false;
+		}
+		if( !asset_base_t::onAppInitialized( std::move( system ) ) )
+		{
+			return false;
+		}
+
+		constexpr uint32_t WorkgroupSize = 256;
+		constexpr uint32_t AxisDimension = 3;
+		constexpr uint32_t PassesPerAxis = 4;
+
+		constexpr uint32_t WorkgroupCount = 2048;
+
+		IAssetLoader::SAssetLoadParams lparams = {};
+		lparams.logger = m_logger.get();
+		lparams.workingDirectory = "";
+		auto checkedLoad = [ & ]<typename T>( const char* filePath ) -> smart_refctd_ptr<T>
+		{
+			// The `IAssetManager::getAsset` function is very complex, in essence it:
+			// 1. takes a cache key or an IFile, if you gave it an `IFile` skip to step 3
+			// 2. it consults the loader override about how to get an `IFile` from your cache key
+			// 3. handles any failure in opening an `IFile` (which is why it takes a supposed filename), it allows the override to give a different file
+			// 4. tries to derive a working directory if you haven't provided one
+			// 5. looks for the assets in the cache if you haven't disabled that in the loader parameters
+			// 5a. lets the override choose relevant assets from the ones found under the cache key
+			// 5b. if nothing was found it lets the override intervene one last time
+			// 6. if there's no file to load from, return no assets
+			// 7. try all loaders associated with a file extension
+			// 8. then try all loaders by opening the file and checking if it will load
+			// 9. insert loaded assets into cache if required
+			// 10. restore assets from dummy state if needed (more on that in other examples)
+			// Take the docs with a grain of salt, the `getAsset` will be rewritten to deal with restores better in the near future.
+			nbl::asset::SAssetBundle bundle = m_assetMgr->getAsset( filePath, lparams );
+			if( bundle.getContents().empty() )
+			{
+				m_logger->log( "Asset %s failed to load! Are you sure it exists?", ILogger::ELL_ERROR, filePath );
+				return nullptr;
+			}
+			// All assets derive from `nbl::asset::IAsset`, and can be casted down if the type matches
+			static_assert( std::is_base_of_v<nbl::asset::IAsset, T> );
+			// The type of the root assets in the bundle is not known until runtime, so this is kinda like a `dynamic_cast` which will return nullptr on type mismatch
+			auto typedAsset = IAsset::castDown<T>( bundle.getContents()[ 0 ] ); // just grab the first asset in the bundle
+			if( !typedAsset )
+			{
+				m_logger->log( "Asset type mismatch, want %d got %d!", ILogger::ELL_ERROR, T::AssetType, bundle.getAssetType() );
+			}
+			return typedAsset;
+		};
+
+		auto textureToBlur = checkedLoad.operator()< nbl::asset::ICPUImage >( "app_resources/tex.jpg" );
+		const auto& inCpuTexInfo = textureToBlur->getCreationParameters();
+
+		auto createGPUImages = [ & ](
+			core::bitflag<IGPUImage::E_USAGE_FLAGS> usageFlags,
+			std::string_view name,
+			smart_refctd_ptr<IGPUImage>&& imgOut,
+			smart_refctd_ptr<IGPUImageView>&& imgViewOut
+		) {
+			video::IGPUImage::SCreationParams gpuImageCreateInfo;
+			gpuImageCreateInfo.flags = inCpuTexInfo.flags;
+			gpuImageCreateInfo.type = inCpuTexInfo.type;
+			gpuImageCreateInfo.extent = inCpuTexInfo.extent;
+			gpuImageCreateInfo.mipLevels = inCpuTexInfo.mipLevels;
+			gpuImageCreateInfo.arrayLayers = inCpuTexInfo.arrayLayers;
+			gpuImageCreateInfo.samples = inCpuTexInfo.samples;
+			gpuImageCreateInfo.tiling = video::IGPUImage::TILING::OPTIMAL;
+			gpuImageCreateInfo.usage = usageFlags | asset::IImage::EUF_TRANSFER_DST_BIT;
+			gpuImageCreateInfo.queueFamilyIndexCount = 0u;
+			gpuImageCreateInfo.queueFamilyIndices = nullptr;
+
+			gpuImageCreateInfo.format = m_physicalDevice->promoteImageFormat(
+				{ inCpuTexInfo.format, gpuImageCreateInfo.usage }, gpuImageCreateInfo.tiling
+			);
+			auto gpuImage = m_device->createImage( std::move( gpuImageCreateInfo ) );
+
+			auto gpuImageMemReqs = gpuImage->getMemoryReqs();
+			gpuImageMemReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+			m_device->allocate( gpuImageMemReqs, gpuImage.get(), video::IDeviceMemoryAllocation::EMAF_NONE );
+
+			auto imgView = m_device->createImageView( {
+				.flags = IGPUImageView::ECF_NONE,
+				.subUsages = usageFlags,
+				.image = gpuImage,
+				.viewType = IGPUImageView::ET_2D,
+				.format = gpuImageCreateInfo.format
+			} );
+			gpuImage->setObjectDebugName( name.data() );
+			imgView->setObjectDebugName( ( std::string{ name } + "view" ).c_str() );
+			imgOut = gpuImage;
+			imgViewOut = imgView;
+		};
+
+
+		smart_refctd_ptr<IGPUImage> inputGpuImg;
+		smart_refctd_ptr<IGPUImage> outputGpuImg;
+		smart_refctd_ptr<IGPUImageView> inputGpuImgView;
+		smart_refctd_ptr<IGPUImageView> outputGpuImgView;
+		createGPUImages( IGPUImage::EUF_SAMPLED_BIT, "InputImg", std::move( inputGpuImg ), std::move( inputGpuImgView ) );
+		createGPUImages( IGPUImage::EUF_STORAGE_BIT, "OutputImg",
+			std::move( outputGpuImg ), std::move( outputGpuImgView ) );
+
+
+		auto computeMain = checkedLoad.operator()< nbl::asset::ICPUShader >( "app_resources/main.comp.hlsl" );
+		smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+			computeMain.get(),
+			"#define WORKGROUP_SIZE %s\n#define AXIS_DIM %d\n#define PASSES_PER_AXIS %d\n",
+			std::to_string( WorkgroupSize ).c_str(), AxisDimension, PassesPerAxis
+		);
+		smart_refctd_ptr<IGPUShader> shader = m_device->createShader( overridenUnspecialized.get() );
+		if( !shader )
+		{
+			return logFail( "Creation of a GPU Shader from CPU Shader source failed!" );
+		}
+
+
+		// TODO: move to shared cpp/hlsl descriptors file
+		NBL_CONSTEXPR_STATIC nbl::video::IGPUDescriptorSetLayout::SBinding bindings[] = {
+			{
+				.binding = 0,
+				.type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::ESS_COMPUTE,
+				.count = 1,
+				.samplers = nullptr
+			},
+			{
+				.binding = 1,
+				.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::ESS_COMPUTE,
+				.count = 1,
+				.samplers = nullptr
+			}
+		};
+		smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = m_device->createDescriptorSetLayout( bindings );
+		if( !dsLayout )
+		{
+			return logFail( "Failed to create a Descriptor Layout!\n" );
+		}
+		const asset::SPushConstantRange pushConst[] = { { .stageFlags = IShader::ESS_COMPUTE, .offset = 0, .size = sizeof( BoxBlurParams ) } };
+		smart_refctd_ptr<IGPUPipelineLayout> pplnLayout = m_device->createPipelineLayout( pushConst, smart_refctd_ptr( dsLayout ) );
+		if( !pplnLayout )
+		{
+			return logFail( "Failed to create a Pipeline Layout!\n" );
+		}
+
+		smart_refctd_ptr<IGPUComputePipeline> pipeline;
+		{
+			IGPUComputePipeline::SCreationParams params = {};
+			params.layout = pplnLayout.get();
+			params.shader.entryPoint = "main";
+			params.shader.shader = shader.get();
+			// we'll cover the specialization constant API in another example
+			if( !m_device->createComputePipelines( nullptr, { &params, 1 }, &pipeline ) )
+			{
+				return logFail( "Failed to create pipelines (compile & link shaders)!\n" );
+			}
+		}
+		smart_refctd_ptr<IGPUSampler> sampler = m_device->createSampler( { .TextureWrapU = ISampler::ETC_CLAMP_TO_EDGE } );
+		smart_refctd_ptr<IGPUDescriptorSet> ds;
+		smart_refctd_ptr<IDescriptorPool> pool = m_device->createDescriptorPoolForDSLayouts(
+			IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 } );
+		ds = pool->createDescriptorSet( std::move( dsLayout ) );
+		{
+			IGPUDescriptorSet::SDescriptorInfo info[ 2 ];
+			info[ 0 ].desc = inputGpuImgView;
+			info[ 0 ].info.image = { .sampler = sampler, .imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL };
+			info[ 1 ].desc = outputGpuImgView;
+			info[ 1 ].info.image = { .sampler = nullptr, .imageLayout = IImage::LAYOUT::GENERAL };
+
+			IGPUDescriptorSet::SWriteDescriptorSet writes[] = {
+				{ .dstSet = ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &info[ 0 ] },
+				{ .dstSet = ds.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &info[ 1 ] },
+			};
+			m_device->updateDescriptorSets( writes, {} );
+		}
+
+		uint32_t computeQueueIndex = getComputeQueue()->getFamilyIndex();
+		IQueue* queue = m_device->getQueue( computeQueueIndex, 0 );
+
+		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+		smart_refctd_ptr<IGPUCommandPool> cmdpool = m_device->createCommandPool(
+			computeQueueIndex, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT );
+		if( !cmdpool->createCommandBuffers( IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf ) )
+		{
+			return logFail( "Failed to create Command Buffers!\n" );
+		}
+
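+		// A minimal sketch of the timeline semaphore pattern used below; the (0,45) value pair is arbitrary,
+		// any strictly increasing pair works:
+		//   auto sema = m_device->createSemaphore( 0 );
+		//   /* submit with .signalSemaphores = { {.semaphore = sema.get(), .value = 45, .stageMask = ...} } */
+		//   m_device->blockForSemaphores( { {.semaphore = sema.get(), .value = 45} } );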
+		constexpr size_t StartedValue = 0;
+		constexpr size_t FinishedValue = 45;
+		static_assert( FinishedValue > StartedValue );
+		smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore( StartedValue );
+
+		IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { { .cmdbuf = cmdbuf.get() } };
+
+		nbl::video::SIntendedSubmitInfo::SFrontHalf frontHalf = { .queue = queue, .commandBuffers = cmdbufs };
+		smart_refctd_ptr<nbl::video::IUtilities> assetStagingMngr =
+			make_smart_refctd_ptr<nbl::video::IUtilities>( smart_refctd_ptr( m_device ), smart_refctd_ptr( m_logger ) );
+
+		cmdbuf->begin( IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT );
+
+		queue->startCapture();
+		bool uploaded = assetStagingMngr->updateImageViaStagingBufferAutoSubmit(
+			frontHalf, textureToBlur->getBuffer(), inCpuTexInfo.format,
+			inputGpuImg.get(), IImage::LAYOUT::UNDEFINED, textureToBlur->getRegions()
+		);
+		queue->endCapture();
+		if( !uploaded )
+		{
+			return logFail( "Failed to upload cpu tex!\n" );
+		}
+
+		cmdbuf->reset( IGPUCommandBuffer::RESET_FLAGS::NONE );
+
+		BoxBlurParams pushConstData = {};
+
+
+		cmdbuf->begin( IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT );
+		cmdbuf->beginDebugMarker( "My Compute Dispatch", core::vectorSIMDf( 0, 1, 0, 1 ) );
+		nbl::video::IGPUCommandBuffer::SImageResolve regions[] = {
+			{
+				.srcSubresource = { .layerCount = 1 },
+				.srcOffset = {},
+				.dstSubresource = { .layerCount = 1 },
+				.dstOffset = {},
+				.extent = inputGpuImg->getCreationParameters().extent
+			}
+		};
+		cmdbuf->resolveImage(
+			inputGpuImg.get(), IImage::LAYOUT::UNDEFINED,
+			inputGpuImg.get(), IImage::LAYOUT::GENERAL,
+			std::size( regions ), regions );
+		nbl::video::IGPUCommandBuffer::SImageResolve regionsOut[] = {
+			{
+				.srcSubresource = { .layerCount = 1 },
+				.srcOffset = {},
+				.dstSubresource = { .layerCount = 1 },
+				.dstOffset = {},
+				.extent = outputGpuImg->getCreationParameters().extent
+			}
+		};
+		cmdbuf->resolveImage(
+			outputGpuImg.get(), IImage::LAYOUT::UNDEFINED,
+			outputGpuImg.get(), IImage::LAYOUT::GENERAL,
+			std::size( regionsOut ), regionsOut );
+		cmdbuf->bindComputePipeline( pipeline.get() );
+		cmdbuf->bindDescriptorSets( nbl::asset::EPBP_COMPUTE, pplnLayout.get(), 0, 1, &ds.get() );
+		cmdbuf->pushConstants( pplnLayout.get(), IShader::ESS_COMPUTE, 0, sizeof( BoxBlurParams ), &pushConstData );
+		cmdbuf->dispatch( WorkgroupCount, 1, 1 );
+
+		const nbl::asset::SMemoryBarrier barriers[] = {
+			{
+				.srcStageMask = nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+				.srcAccessMask = nbl::asset::ACCESS_FLAGS::SHADER_WRITE_BITS,
+				.dstStageMask = nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+				.dstAccessMask = nbl::asset::ACCESS_FLAGS::SHADER_READ_BITS,
+			}
+		};
+		cmdbuf->pipelineBarrier( nbl::asset::EDF_NONE, { .memBarriers = barriers } );
+
+		cmdbuf->dispatch( WorkgroupCount, 1, 1 );
+		cmdbuf->endDebugMarker();
+		// Normally you'd want to perform a memory barrier when using the output of a compute shader or renderpass,
+		// however waiting on a timeline semaphore (or fence) on the Host makes all Device writes visible.
+		cmdbuf->end();
+
+		{
+			// The IGPUCommandBuffer is the only object whose usage does not get automagically tracked internally, you're responsible for holding onto it as long as the GPU needs it.
+			// So this is why our commandbuffer, even though it's transient, lives in a scope at or above the place where we wait for the submission to be signalled as complete.
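+			// Sidenote on the .stageMask of the signal info below: with synchronization2 semantics the semaphore
+			// may signal as soon as the compute stage finishes, it does not have to wait for the whole pipeline to drain.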
+			const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { { .cmdbuf = cmdbuf.get() } };
+			// But we do need to signal completion by incrementing the Timeline Semaphore counter as soon as the compute shader is done
+			const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { { .semaphore = progress.get(), .value = FinishedValue, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT } };
+			// Default, we have no semaphores to wait on before we can start our workload
+			IQueue::SSubmitInfo submitInfos[] = { { .commandBuffers = cmdbufs, .signalSemaphores = signals } };
+
+			// We have a cool integration with RenderDoc that allows you to start and end captures programmatically.
+			// This is super useful for debugging multi-queue workloads and by default RenderDoc delimits captures only by Swapchain presents.
+			queue->startCapture();
+			queue->submit( submitInfos );
+			queue->endCapture();
+		}
+		// As the name implies this function will not progress until the fence signals or repeated waiting returns an error.
+		const ISemaphore::SWaitInfo waitInfos[] = { { .semaphore = progress.get(), .value = FinishedValue } };
+		m_device->blockForSemaphores( waitInfos );
+
+
+		return true;
+	}
+
+	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a cross-platform app, you have to let the framework deal with your "game loop"
+	void workLoopBody() override {}
+
+	// Whether to keep invoking the above. In this example, because it's headless GPU compute, we do all the work in the app initialization.
+	bool keepRunning() override { return false; }
+
+};
+
+
+NBL_MAIN_FUNC( BoxBlurDemo )
\ No newline at end of file
diff --git a/66_PropertyPools/CMakeLists.txt b/66_PropertyPools/CMakeLists.txt
new file mode 100644
index 000000000..bc1624875
--- /dev/null
+++ b/66_PropertyPools/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/66_PropertyPools/app_resources/common.hlsl b/66_PropertyPools/app_resources/common.hlsl new file mode 100644 index 000000000..6f339aa13 --- /dev/null +++ b/66_PropertyPools/app_resources/common.hlsl @@ -0,0 +1,22 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +// Unfortunately not every piece of C++14 metaprogramming syntax is available in HLSL 202x +// https://github.com/microsoft/DirectXShaderCompiler/issues/5751#issuecomment-1800847954 +typedef nbl::hlsl::float32_t3 input_t; +typedef nbl::hlsl::float32_t output_t; + +NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxPossibleElementCount = 1 << 20; + +struct PushConstantData +{ + uint64_t inputAddress; + uint64_t outputAddress; + uint32_t dataElementCount; +}; + +NBL_CONSTEXPR uint32_t WorkgroupSize = 256; + +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + +// Yes we do have our own re-creation of C++'s STL in HLSL2021 ! 
+#include "nbl/builtin/hlsl/limits.hlsl" \ No newline at end of file diff --git a/66_PropertyPools/app_resources/shader.comp.hlsl b/66_PropertyPools/app_resources/shader.comp.hlsl new file mode 100644 index 000000000..4aeef0e0f --- /dev/null +++ b/66_PropertyPools/app_resources/shader.comp.hlsl @@ -0,0 +1,33 @@ +#include "common.hlsl" + +// just a small test +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +[[vk::push_constant]] PushConstantData pushConstants; + +// does absolutely nothing, a later example will show how it gets used +template +void dummyTraitTest() {} + +[numthreads(WorkgroupSize,1,1)] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + dummyTraitTest(); + if (ID.x>=pushConstants.dataElementCount) + return; + + const input_t self = vk::RawBufferLoad(pushConstants.inputAddress+sizeof(input_t)*ID.x); + + nbl::hlsl::Xoroshiro64StarStar rng = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(pushConstants.dataElementCount,ID.x)^0xdeadbeefu); + + float32_t acc = nbl::hlsl::numeric_limits::max; + const static uint32_t OthersToTest = 15; + [[unroll(OthersToTest)]] + for (uint32_t i=0; i(pushConstants.inputAddress+sizeof(input_t)*offset); + acc = min(length(other-self),acc); + } + vk::RawBufferStore(pushConstants.outputAddress+sizeof(float32_t)*ID.x,acc); +} \ No newline at end of file diff --git a/66_PropertyPools/config.json.template b/66_PropertyPools/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/66_PropertyPools/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp new file mode 100644 index 000000000..e1ab9d7b3 --- /dev/null +++ b/66_PropertyPools/main.cpp @@ -0,0 +1,419 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + +#include "nbl/video/surface/CSurfaceVulkan.h" + +#include "../common/BasicMultiQueueApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +namespace nbl::examples +{ + +using namespace nbl; +using namespace core; +using namespace system; +using namespace ui; +using namespace asset; +using namespace video; + +// Virtual Inheritance because apps might end up doing diamond inheritance +class WindowedApplication : public virtual BasicMultiQueueApplication +{ + using base_t = BasicMultiQueueApplication; + + public: + using base_t::base_t; + + virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override + { + auto retval = base_t::getAPIFeaturesToEnable(); + // We only support one swapchain mode, surface, the other one is Display which we have not implemented yet. 
+			retval.swapchainMode = video::E_SWAPCHAIN_MODE::ESM_SURFACE;
+			return retval;
+		}
+
+		// New function, we need to know about surfaces to create ahead of time
+		virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const = 0;
+
+		virtual core::set<video::IPhysicalDevice*> filterDevices(const core::SRange<video::IPhysicalDevice* const>& physicalDevices) const
+		{
+			const auto firstFilter = base_t::filterDevices(physicalDevices);
+
+			video::SPhysicalDeviceFilter deviceFilter = {};
+
+			const auto surfaces = getSurfaces();
+			deviceFilter.requiredSurfaceCompatibilities = surfaces.data();
+			deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size();
+
+			return deviceFilter(physicalDevices);
+		}
+
+		virtual bool onAppInitialized(smart_refctd_ptr<ISystem>&& system)
+		{
+			// Remember to call the base class initialization!
+			if (!base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			#ifdef _NBL_PLATFORM_WINDOWS_
+			m_winMgr = nbl::ui::IWindowManagerWin32::create();
+			#else
+			#error "Unimplemented!"
+			#endif
+			return true;
+		}
+
+		core::smart_refctd_ptr<ui::IWindowManager> m_winMgr;
+};
+
+
+// Before we get onto creating a window, we need to discuss how Nabla handles input, clipboards and cursor control
+class IWindowClosedCallback : public virtual nbl::ui::IWindow::IEventCallback
+{
+	public:
+		IWindowClosedCallback() : m_gotWindowClosedMsg(false) {}
+
+		// unless you create a separate callback per window, both will "trip" this condition
+		bool windowGotClosed() const {return m_gotWindowClosedMsg;}
+
+	private:
+		bool onWindowClosed_impl() override
+		{
+			m_gotWindowClosedMsg = true;
+			return true;
+		}
+
+		bool m_gotWindowClosedMsg;
+};
+
+// We inherit from an application that tries to find Graphics and Compute queues
+// because applications with presentable images often want to perform Graphics family operations
+// Virtual Inheritance because apps might end up doing diamond inheritance
+class SingleNonResizableWindowApplication : public virtual WindowedApplication
+{
+		using base_t = WindowedApplication;
+
+	protected:
+		virtual IWindow::SCreationParams getWindowCreationParams() const
+		{
+			IWindow::SCreationParams params = {};
+			params.callback = make_smart_refctd_ptr<IWindowClosedCallback>();
+			params.width = 640;
+			params.height = 480;
+			params.x = 32;
+			params.y = 32;
+			params.flags = IWindow::ECF_NONE;
+			params.windowCaption = "SingleNonResizableWindowApplication";
+			return params;
+		}
+
+		core::smart_refctd_ptr<ui::IWindow> m_window;
+		core::smart_refctd_ptr<video::ISurface> m_surface;
+
+	public:
+		using base_t::base_t;
+
+		virtual bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			// Remember to call the base class initialization!
+			if (!base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			m_window = m_winMgr->createWindow(getWindowCreationParams());
+			m_surface = video::CSurfaceVulkanWin32::create(core::smart_refctd_ptr(m_api),core::smart_refctd_ptr_static_cast<ui::IWindowWin32>(m_window));
+			return true;
+		}
+
+		virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const
+		{
+			return {{m_surface.get()/*,EQF_NONE*/}};
+		}
+
+		virtual bool keepRunning() override
+		{
+			if (!m_window || reinterpret_cast<const IWindowClosedCallback*>(m_window->getEventCallback())->windowGotClosed())
+				return false;
+
+			return true;
+		}
+};
+}
+
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace ui;
+using namespace asset;
+using namespace video;
+
+
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+
+
+// In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants
+class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
+{
+		using device_base_t = examples::MonoDeviceApplication;
+		using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
+
+		// This is the first example that submits multiple workloads in-flight.
+		// What the shader does is it computes the minimum distance of each point against K other random input points.
+		// Having the GPU randomly access parts of the buffer requires it to be DEVICE_LOCAL for performance.
+		// Then the CPU downloads the results and finds the median minimum distance via quick-select.
+		// This bizarre synthetic workload was specifically chosen for its unfriendliness towards simple buffer usage.
+		// The fact we have variable sized workloads and run them in a loop means we either have to dynamically
+		// suballocate from a single buffer or have K worst-case sized buffers we round robin for K-workloads in flight.
+		// Creating and destroying buffers at runtime is not an option as those are very expensive operations.
+		// Also since the CPU needs to heapify the outputs, we need to have the GPU write them into RAM not VRAM.
+		smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+
+		// The Utility class has lots of methods to handle staging without relying on ReBAR or EXT_host_image_copy, as well as more complex methods we'll cover later.
+		// Until EXT_host_image_copy becomes ubiquitous across all Nabla Core Profile devices, you need to stage image copies from an IGPUBuffer to an IGPUImage.
+		// Why use Staging for buffers in the age of ReBAR? While GPU workloads overlap the CPU, individual GPU workloads' execution might not overlap each other
+		// but their data might. In this case you want to "precisely" time the data update on the GPU timeline between the end and start of a workload.
+		// For very small updates you could use the commandbuffer updateBuffer method, but it has a size limit and the data enqueued takes up space in the commandpool.
+		// Sometimes it might be unfeasible to either have multiple copies or update references to those copies without a cascade update.
+		// One example is the transformation graph of nodes in a scene, where a copy-on-write of a node would require updating the offset/pointer held by
+		// any other node that refers to it. This quickly turns into a cascade that would force you to basically create a full copy of the entire data structure
+		// after most updates. Whereas with staging you'd "queue up" the much smaller set of updates to apply between each computation step which uses the graph.
+		// Another example are UBO and SSBO bindings, where once you run out of dynamic bindings, you can no longer easily change offsets without introducing extra indirection in shaders.
+		// Actually staging can help you re-use a commandbuffer because you don't need to re-record it if you don't need to change the offsets at which you bind!
+		// Finally ReBAR is a precious resource, my 8GB RTX 3070 only reports a 214MB Heap backing HOST_VISIBLE and DEVICE_LOCAL device local memory type.
+		smart_refctd_ptr<video::IUtilities> m_utils;
+
+		// We call them downstreaming and upstreaming, simply by how we used them so far.
+		// Meaning that upstreaming is uncached and usually ReBAR (DEVICE_LOCAL), for simple memcpy like sequential writes.
+		// While the downstreaming is CACHED and not DEVICE_LOCAL for fast random access by the CPU.
+		// However there are cases when you'd want to use a buffer with flags identical to the default downstreaming buffer for uploads,
+		// such a case is when a CPU needs to build a data-structure in-place (due to memory constraints) before the GPU accesses it,
+		// one example being Host Acceleration Structure builds (BVH building requires lots of repeated memory accesses).
+		// When choosing the memory properties of a mapped buffer consider which processor (CPU or GPU) needs faster access in event of a cache-miss.
+		nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
+		StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
+		// These are Buffer Device Addresses
+		uint64_t m_upStreamingBufferAddress;
+		uint64_t m_downStreamingBufferAddress;
+
+		smart_refctd_ptr<video::CPropertyPoolHandler> m_propertyPoolHandler;
+		smart_refctd_ptr<IGPUBuffer> m_scratchBuffer;
+		smart_refctd_ptr<IGPUBuffer> m_addressBuffer;
+		smart_refctd_ptr<IGPUBuffer> m_transferSrcBuffer;
+		smart_refctd_ptr<IGPUBuffer> m_transferDstBuffer;
+		std::vector<uint16_t> m_data;
+
+		// You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers to give out offsets aligned to a certain multiple (not only a Power of Two!)
+		uint32_t m_alignment;
+
+		// The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
+		// It's a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
+		smart_refctd_ptr<ICommandPoolCache> m_poolCache;
+
+		// We'll run the iterations in reverse, easier to write "keep running"
+		uint32_t m_iteration = 200;
+
+		static constexpr uint64_t TransfersAmount = 1024;
+		static constexpr uint64_t MaxValuesPerTransfer = 512;
+
+	public:
+		// Yay thanks to multiple inheritance we cannot forward ctors anymore
+		PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+			system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}
+
+		// we stuff all our work here because it's a "single shot" app
+		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			// Remember to call the base class initialization!
+			if (!device_base_t::onAppInitialized(std::move(system)))
+				return false;
+			if (!asset_base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			m_propertyPoolHandler = core::make_smart_refctd_ptr<video::CPropertyPoolHandler>(core::smart_refctd_ptr(m_device));
+
+			auto createBuffer = [&](uint64_t size, core::bitflag<asset::IBuffer::E_USAGE_FLAGS> flags, const char* name, bool hostVisible)
+			{
+				video::IGPUBuffer::SCreationParams creationParams;
+				creationParams.size = ((size + 3) / 4) * 4; // Align to 4 bytes
+				creationParams.usage = flags
+					| asset::IBuffer::EUF_STORAGE_BUFFER_BIT
+					| asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT
+					| asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF;
+
+				auto buffer = m_device->createBuffer(std::move(creationParams));
+				nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs();
+				if (hostVisible)
+					reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+				m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT);
+				buffer->setObjectDebugName(name);
+
+				return buffer;
+			};
+
+			m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", false);
+			m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_addressBuffer", false);
+			m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_transferSrcBuffer", false);
+			m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_transferDstBuffer", true);
+
+			for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++)
+				m_data.push_back(i);
+
+			// this time we load a shader directly from a file
+			smart_refctd_ptr<IGPUShader> shader;
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = ""; // virtual root
+
+				auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+					return logFail("Could not load shader!");
+
+				// let's go straight from ICPUSpecializedShader to IGPUSpecializedShader
+				auto source = IAsset::castDown<ICPUShader>(assets[0]);
+				// The down-cast should not fail!
+				assert(source);
+
+				IGPUObjectFromAssetConverter::SParams conversionParams = {};
+				conversionParams.device = m_device.get();
+				conversionParams.assetManager = m_assetMgr.get();
+				created_gpu_object_array<ICPUShader> convertedGPUObjects = std::make_unique<IGPUObjectFromAssetConverter>()->getGPUObjectsFromAssets(&source,&source+1,conversionParams);
+				if (convertedGPUObjects->empty() || !convertedGPUObjects->front())
+					return logFail("Conversion of a CPU Specialized Shader to GPU failed!");
+
+				shader = convertedGPUObjects->front();
+			}
+
+			// The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator`
+			// The difference is that the streaming ones are made on top of ranges of `IGPUBuffer`s backed by mappable memory, whereas the
+			// `CAsyncSingleBufferSubAllocator` just allows you to suballocate subranges of any `IGPUBuffer` range with deferred/latched frees.
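+			// For reference on the shifts below: output_t is a float32_t (4 bytes) so 4<<24 gives a 64 MiB
+			// downstream buffer, while input_t is a float32_t3 (12 bytes) so 12<<24 gives a 192 MiB upstream one.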
+			constexpr uint32_t DownstreamBufferSize = sizeof(output_t)<<24;
+			constexpr uint32_t UpstreamBufferSize = sizeof(input_t)<<24;
+			m_utils = make_smart_refctd_ptr<video::IUtilities>(smart_refctd_ptr(m_device),smart_refctd_ptr(m_logger),DownstreamBufferSize,UpstreamBufferSize);
+			if (!m_utils)
+				return logFail("Failed to create Utilities!");
+			m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+			m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+			m_upStreamingBufferAddress = m_device->getBufferDeviceAddress(m_upStreamingBuffer->getBuffer());
+			m_downStreamingBufferAddress = m_device->getBufferDeviceAddress(m_downStreamingBuffer->getBuffer());
+
+			// People love Reflection but I prefer Shader Sources instead!
+			const nbl::asset::SPushConstantRange pcRange = {.stageFlags=IShader::ESS_COMPUTE,.offset=0,.size=sizeof(PushConstantData)};
+
+			// This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size
+			// and using traditional SSBO bindings would force us to update the Descriptor Set every frame.
+			// I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic,
+			// only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding.
+			// Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size.
+			m_pipeline = m_device->createComputePipeline(nullptr,m_device->createPipelineLayout(&pcRange,&pcRange+1),std::move(shader));
+
+			const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+			// The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
+			// which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
+			// Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
+			// there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
+			// We'll align to the max of the coherent atom size even if the memory is coherent,
+			// and we also need to take into account that BDA shader loads need to be aligned to the type being loaded.
+			m_alignment = core::max(deviceLimits.nonCoherentAtomSize,alignof(float));
+
+			// We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are
+			// the amount of memory in the streaming buffers and the number of commandpools we can use simultaneously.
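+			// A rough sketch of the round-robin usage this enables (workLoopBody() below does the real thing):
+			//   uint32_t ix;
+			//   do { ix = m_poolCache->acquirePool(); } while (ix==ICommandPoolCache::invalid_index);
+			//   /* record & submit a commandbuffer from pool `ix` */
+			//   m_poolCache->releaseSet(m_device.get(),std::move(fence),ix);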
+			constexpr auto MaxConcurrency = 64;
+			// Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag
+			m_poolCache = make_smart_refctd_ptr<ICommandPoolCache>(m_device.get(),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::ECF_NONE,MaxConcurrency);
+
+			return true;
+		}
+
+		// Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
+		bool keepRunning() override { return m_iteration; }
+
+		// Finally the first actual work-loop
+		void workLoopBody() override
+		{
+			m_iteration--;
+			IGPUQueue* const queue = getComputeQueue();
+
+			// Obtain our command pool once one gets recycled
+			uint32_t poolIx;
+			do
+			{
+				poolIx = m_poolCache->acquirePool();
+			} while (poolIx==ICommandPoolCache::invalid_index);
+
+			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+			{
+				m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf);
+				// let's record; it's still a one-time submit because we have to re-record with different push constants each time
+				cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
+				cmdbuf->bindComputePipeline(m_pipeline.get());
+
+				// COMMAND RECORDING
+				uint32_t dataSize = (((sizeof(uint16_t) * m_data.size()) + 3) / 4) * 4;
+				uint32_t maxUpload = 65536;
+				for (uint32_t offset = 0; offset < dataSize; offset += maxUpload)
+				{
+					cmdbuf->updateBuffer(m_transferSrcBuffer.get(), offset, maxUpload, &m_data[offset / sizeof(uint16_t)]);
+				}
+				CPropertyPoolHandler::TransferRequest transferRequest;
+				transferRequest.memblock = asset::SBufferRange<video::IGPUBuffer> { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) };
+				transferRequest.elementSize = m_data.size();
+				transferRequest.elementCount = 1;
+				transferRequest.buffer = asset::SBufferBinding<video::IGPUBuffer> { 0, core::smart_refctd_ptr(m_transferDstBuffer) };
+
+				m_propertyPoolHandler->transferProperties(cmdbuf.get(), nullptr,
+					asset::SBufferBinding<video::IGPUBuffer>{0, core::smart_refctd_ptr(m_scratchBuffer)},
+					asset::SBufferBinding<video::IGPUBuffer>{0, core::smart_refctd_ptr(m_addressBuffer)},
+					&transferRequest, &transferRequest + 1,
+					m_logger.get(), 0, MaxValuesPerTransfer
+				);
+
+				cmdbuf->end();
+			}
+
+			// TODO: redo with a single timeline semaphore
+			auto fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED);
+			{
+				IGPUQueue::SSubmitInfo submitInfo = {};
+				submitInfo.commandBufferCount = 1;
+				submitInfo.commandBuffers = &cmdbuf.get();
+
+				queue->startCapture();
+				queue->submit(1u,&submitInfo,fence.get());
+				queue->endCapture();
+			}
+
+			{
+				// Readback the transfer destination buffer and print the first few values
+				auto mem = m_transferDstBuffer->getBoundMemory();
+				assert(mem->isMappable());
+				auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ);
+				auto uint16_t_ptr = static_cast<uint16_t*>(ptr);
+
+				for (uint32_t i = 0; i < 128; i++)
+				{
+					uint16_t value = uint16_t_ptr[i];
+					std::printf("%i, ", value);
+				}
+				std::printf("\n");
+				m_device->unmapMemory(mem);
+			}
+
+			// We can also actually latch our Command Pool reset and its return to the pool of free pools!
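+			// (the release below is latched on the fence, the pool only gets reset and returned once the
+			// GPU signals it, so we never reset a commandbuffer the GPU might still be reading)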
+			m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx);
+		}
+
+		bool onAppTerminated() override
+		{
+			// Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated`
+			// (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain)
+			while (m_downStreamingBuffer->cull_frees()) {}
+
+			return device_base_t::onAppTerminated();
+		}
+};
+
+
+NBL_MAIN_FUNC(PropertyPoolsApp)
\ No newline at end of file
diff --git a/66_PropertyPools/pipeline.groovy b/66_PropertyPools/pipeline.groovy
new file mode 100644
index 000000000..1a7b043a4
--- /dev/null
+++ b/66_PropertyPools/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
+{
+	public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+
+		return true
+	}
+
+	@Override
+	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a20a33a9..5b104b06a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,7 @@ if(NBL_BUILD_EXAMPLES)
 	endif()
 	add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL)
 	# add_subdirectory(23_Autoexposure EXCLUDE_FROM_ALL)
-	# add_subdirectory(25_Blur EXCLUDE_FROM_ALL)
+	add_subdirectory(26_CentralLimitBoxBlur EXCLUDE_FROM_ALL)
 	add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL)
 	# add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL)
@@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES)
 	#add_subdirectory(61_UI EXCLUDE_FROM_ALL)
 	add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
+	add_subdirectory(66_PropertyPools EXCLUDE_FROM_ALL)
 	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 endif()
\ No newline at end of file
diff --git a/common/MonoDeviceApplication.hpp b/common/MonoDeviceApplication.hpp
index b77e3442c..6a4911da7 100644
--- a/common/MonoDeviceApplication.hpp
+++ b/common/MonoDeviceApplication.hpp
@@ -18,7 +18,7 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 	public:
 		using base_t::base_t;
 
-	protected:
+	public:
 		// need this one for skipping passing all args into ApplicationFramework
 		MonoDeviceApplication() = default;