diff --git a/22.RaytracedAO/README.md b/22.RaytracedAO/README.md index 374c16e2e..03bec2bc1 100644 --- a/22.RaytracedAO/README.md +++ b/22.RaytracedAO/README.md @@ -52,13 +52,25 @@ Multiple Sensor tags in mitsuba XML's is now supported. This feature helps you h You can switch between those sensors using `PAGE UP/DOWN` Keys defined in more detail below. +### Properties added to \: + +| Property Name | Description | Type | Default Value | +|-----------------|-------------------------------------------|---------|----------------| +| hideEnvironment | Replace bakcground with Transparent Alpha | boolean | false | + +Note that we don't support Mitsuba's `hideEmitters` + ### Properties added to \: -| Property Name | Description | Type | Default Value | -|---------------|-----------------------|-------|------------------------------------------| -| moveSpeed | Camera Movement Speed | float | NaN -> Will be deduced from scene bounds | -| zoomSpeed | Camera Zoom Speed | float | NaN -> Will be deduced from scene bounds | -| rotateSpeed | Camera Rotation Speed | float | 300.0 | +| Property Name | Description | Type | Default Value | +|---------------|-------------------------------------------------------------------------------------|---------|------------------------------------------| +| up | Up Vector to determine roll around view axis and the north pole to rotate around | vector | 0.0, 1.0, 0.0 | +| moveSpeed | Camera Movement Speed | float | NaN -> Will be deduced from scene bounds | +| zoomSpeed | Camera Zoom Speed | float | NaN -> Will be deduced from scene bounds | +| rotateSpeed | Camera Rotation Speed | float | 300.0 | +| clipPlaneN\* | Worldspace coefficients for a plane equation of the form `a*x + b*y + c*z + w >= 0` | vector | 0.0, 0.0, 0.0, 0.0 (disabled) | + +\* N ranges from 0 to 5 #### Properties added to \: @@ -93,6 +105,7 @@ You can switch between those sensors using `PAGE UP/DOWN` Keys defined in more d ```xml + @@ -165,13 +178,14 @@ So the full width, height are 1152x1152 (64+1024+64=1152) | Property Name | Description | Type | Default Value | |---------------|------------------------------------------------------------------------------------------------------------------|-------|---------------| -| normalizeEnergy | Parameter to normalize the intensity of emission profile.
1) If `normalizeEnergy` is zero, it will not perform any normalization. (no normalization)
2) If `normalizeEnergy` is negative, it will normalize the intensity by dividing out the maximum intensity. (normalization by max)
3) If `normalizeEnergy` is positive, it will first normalize the intensity by average energy and multiply `normalizeEnergy` to intensity. (normalization by energy) | float | 0.0 (no normalization) | -| filename | The filename of the IES profile. | string | "" | +| normalization | Parameter to normalize the intensity of emission profile.
1) If `normalization` is `NONE`, invalid, or not one of the values below, it will not perform any normalization.
2) If `normalization` is `UNIT_MAX`, it will normalize the intensity by dividing out the maximum intensity. (normalization by max)
3) If `normalization` is `UNIT_AVERAGE_OVER_IMPLIED_DOMAIN`, it will integrate the profile over the hemisphere as well as the solid angles where the profile has emission above 0. This has an advantage over a plain average as you don't need to care whether the light is a sphere, hemisphere, or a spotlight of a given aperture. (normalization by energy)
4) If `normalization` is `UNIT_AVERAGE_OVER_FULL_DOMAIN`, we behave like `UNIT_AVERAGE` but presume the solid angle of the domain is `(CIESProfile::vAngles.front()-CIESProfile::vAngles.back())*4.f` | string | ""
(no normalization) | +| flatten | Optional "blend" of the original profile value with the average value: if negative, we use the average as if for `UNIT_AVERAGE_OVER_FULL_DOMAIN`; if positive, we use the average as if for `UNIT_AVERAGE_OVER_IMPLIED_DOMAIN`.
This is useful when the emitter appears "not bright enough" when observed from directions outside the main power lobes.
The valid range is 0.0 to 1.0; the value gets treated as `min(abs(flatten),1.f)` to make it conform.
A value equal to 1.0 or -1.0 will render your IES profile uniform, so its not something you should use and a warning will be emitted. | float | 0.0 | +| filename | The filename of the IES profile. | string | "" | NOTE: **\** tag of emitter node can be used to orient the emission direction of IES light. -#### Example of Area Light with IES Profile +#### Example of Area Light with IES Profile which flattens its profile against a full Sphere or Hemisphere average ```xml @@ -180,7 +194,8 @@ NOTE: **\** tag of emitter node can be used to orient the emission d - + + diff --git a/22.RaytracedAO/Renderer.cpp b/22.RaytracedAO/Renderer.cpp index 1e4b15c8b..265f4986d 100644 --- a/22.RaytracedAO/Renderer.cpp +++ b/22.RaytracedAO/Renderer.cpp @@ -51,7 +51,7 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I m_rrManager(ext::RadeonRays::Manager::create(m_driver)), m_prevView(), m_prevCamTform(), m_sceneBound(FLT_MAX,FLT_MAX,FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX), m_maxAreaLightLuma(0.f), m_framesDispatched(0u), m_rcpPixelSize{0.f,0.f}, - m_staticViewData{ {0u,0u},0u,0u,0u,0u,core::infinity(),{}}, m_raytraceCommonData{0.f,0u,0u,0u,core::matrix3x4SIMD()}, + m_staticViewData{ {0u,0u},0u,0u,0u,0u,false,core::infinity(),{}}, m_raytraceCommonData{0.f,0u,0u,0u,core::matrix3x4SIMD()}, m_indirectDrawBuffers{nullptr},m_cullPushConstants{core::matrix4SIMD(),1.f,0u,0u,0u},m_cullWorkGroups(0u), m_raygenWorkGroups{0u,0u},m_visibilityBuffer(nullptr),m_colorBuffer(nullptr), m_envMapImportanceSampling(_driver) @@ -75,6 +75,9 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I m_littleDownloadBuffer->getBoundMemory()->mapMemoryRange(IDriverMemoryAllocation::EMCAF_READ,{0,sizeof(uint32_t)}); } + // no deferral for now + m_fragGPUShader = gpuSpecializedShaderFromFile(m_assetManager,m_driver,"../fillVisBuffer.frag"); + // set up Visibility Buffer pipeline { IGPUDescriptorSetLayout::SBinding binding; @@ -98,26 +101,9 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I m_cullDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+cullingDescriptorCount); } m_perCameraRasterDSLayout = core::smart_refctd_ptr(m_cullDSLayout); - { - core::smart_refctd_ptr shaders[] = {gpuSpecializedShaderFromFile(m_assetManager,m_driver,"../fillVisBuffer.vert"),gpuSpecializedShaderFromFile(m_assetManager,m_driver,"../fillVisBuffer.frag")}; - SPrimitiveAssemblyParams primitiveAssembly; - primitiveAssembly.primitiveType = EPT_TRIANGLE_LIST; - SRasterizationParams raster; - raster.faceCullingMode = EFCM_NONE; - auto _visibilityBufferFillPipelineLayout = m_driver->createGPUPipelineLayout( - nullptr,nullptr, - core::smart_refctd_ptr(m_rasterInstanceDataDSLayout), - core::smart_refctd_ptr(m_additionalGlobalDSLayout), - core::smart_refctd_ptr(m_cullDSLayout) - ); - m_visibilityBufferFillPipeline = m_driver->createGPURenderpassIndependentPipeline( - nullptr,std::move(_visibilityBufferFillPipelineLayout),&shaders->get(),&shaders->get()+2u, - SVertexInputParams{},SBlendParams{},primitiveAssembly,raster - ); - } { - constexpr auto raytracingCommonDescriptorCount = 10u; + constexpr auto raytracingCommonDescriptorCount = 11u; IGPUDescriptorSetLayout::SBinding bindings[raytracingCommonDescriptorCount]; fillIotaDescriptorBindingDeclarations(bindings,ISpecializedShader::ESS_COMPUTE,raytracingCommonDescriptorCount); bindings[0].type = asset::EDT_UNIFORM_BUFFER; @@ -127,9 +113,10 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, 
scene::I bindings[4].type = asset::EDT_STORAGE_BUFFER; bindings[5].type = asset::EDT_STORAGE_IMAGE; bindings[6].type = asset::EDT_STORAGE_IMAGE; - bindings[7].type = asset::EDT_COMBINED_IMAGE_SAMPLER; + bindings[7].type = asset::EDT_STORAGE_IMAGE; bindings[8].type = asset::EDT_COMBINED_IMAGE_SAMPLER; bindings[9].type = asset::EDT_COMBINED_IMAGE_SAMPLER; + bindings[10].type = asset::EDT_COMBINED_IMAGE_SAMPLER; m_commonRaytracingDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+raytracingCommonDescriptorCount); } @@ -158,7 +145,7 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I m_closestHitDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+closestHitDescriptorCount); } { - constexpr auto resolveDescriptorCount = 7u; + constexpr auto resolveDescriptorCount = 8u; IGPUDescriptorSetLayout::SBinding bindings[resolveDescriptorCount]; fillIotaDescriptorBindingDeclarations(bindings,ISpecializedShader::ESS_COMPUTE,resolveDescriptorCount); bindings[0].type = asset::EDT_UNIFORM_BUFFER; @@ -168,9 +155,11 @@ Renderer::Renderer(IVideoDriver* _driver, IAssetManager* _assetManager, scene::I bindings[2].samplers = &sampler; bindings[3].type = asset::EDT_COMBINED_IMAGE_SAMPLER; bindings[3].samplers = &sampler; - bindings[4].type = asset::EDT_STORAGE_IMAGE; + bindings[4].type = asset::EDT_COMBINED_IMAGE_SAMPLER; + bindings[4].samplers = &sampler; bindings[5].type = asset::EDT_STORAGE_IMAGE; bindings[6].type = asset::EDT_STORAGE_IMAGE; + bindings[7].type = asset::EDT_STORAGE_IMAGE; m_resolveDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+resolveDescriptorCount); } @@ -212,8 +201,11 @@ Renderer::InitializationData Renderer::initSceneObjects(const SAssetBundle& mesh { case Enum::DIRECT: maxPathDepth = 2u; + hideEnvironment = integrator->direct.hideEnvironment; break; case Enum::PATH: + hideEnvironment = integrator->path.hideEnvironment; + [[fallthrough]]; case Enum::VOL_PATH_SIMPLE: case Enum::VOL_PATH: case Enum::BDPT: @@ -293,7 +285,7 @@ Renderer::InitializationData Renderer::initSceneObjects(const SAssetBundle& mesh // one instance data per instance of a batch core::smart_refctd_ptr newInstanceDataBuffer; - constexpr uint16_t minTrisBatch = MAX_TRIANGLES_IN_BATCH>>1u; + constexpr uint16_t minTrisBatch = MAX_TRIANGLES_IN_BATCH>>3u; // allow small allocations to fight fragmentation constexpr uint16_t maxTrisBatch = MAX_TRIANGLES_IN_BATCH; constexpr uint8_t minVertexSize = asset::getTexelOrBlockBytesize()+ @@ -304,8 +296,8 @@ Renderer::InitializationData Renderer::initSceneObjects(const SAssetBundle& mesh constexpr uint16_t minIndicesBatch = minTrisBatch*kIndicesPerTriangle; CPUMeshPacker::AllocationParams allocParams; - allocParams.vertexBuffSupportedByteSize = 1u<<31u; - allocParams.vertexBufferMinAllocByteSize = minTrisBatch*minVertexSize; + allocParams.vertexBuffSupportedByteSize = (1u<<31u)-1; // RTX cards + allocParams.vertexBufferMinAllocByteSize = minTrisBatch*minVertexSize; // under max vertex reuse allocParams.indexBuffSupportedCnt = (allocParams.vertexBuffSupportedByteSize/allocParams.vertexBufferMinAllocByteSize)*minIndicesBatch; allocParams.indexBufferMinAllocCnt = minIndicesBatch; allocParams.MDIDataBuffSupportedCnt = allocParams.indexBuffSupportedCnt/minIndicesBatch; @@ -318,7 +310,7 @@ Renderer::InitializationData Renderer::initSceneObjects(const SAssetBundle& mesh IMeshPackerV2Base::SupportedFormatsContainer formats; formats.insert(EF_R32G32B32_SFLOAT); - formats.insert(EF_R32G32_UINT); + 
formats.insert(EF_R32G32B32_UINT); auto cpump = core::make_smart_refctd_ptr>(allocParams,formats,minTrisBatch,maxTrisBatch); uint32_t mdiBoundMax=0u,batchInstanceBoundTotal=0u; core::vector allocData; @@ -347,20 +339,23 @@ Renderer::InitializationData Renderer::initSceneObjects(const SAssetBundle& mesh assert(meshBuffer->getInstanceCount()==instanceCount); // We'll disable certain attributes to ensure we only copy position, normal and uv attribute SVertexInputParams& vertexInput = meshBuffer->getPipeline()->getVertexInputParams(); - // but we'll pack normals and UVs together to save one SSBO binding (and quantize UVs to half floats) + // but we'll pack normals and UVs together to save one SSBO binding, but no quantization of UVs to keep accurate floating point precision for baricentrics constexpr auto freeBinding = 15u; vertexInput.attributes[combinedNormalUVAttributeIx].binding = freeBinding; - vertexInput.attributes[combinedNormalUVAttributeIx].format = EF_R32G32_UINT; + vertexInput.attributes[combinedNormalUVAttributeIx].format = EF_R32G32B32_UINT; vertexInput.attributes[combinedNormalUVAttributeIx].relativeOffset = 0u; vertexInput.enabledBindingFlags |= 0x1u<getBaseVertex(); + struct CombinedNormalUV { - uint32_t nml; - uint16_t u,v; + uint32_t normal; + float u, v; }; + static_assert(sizeof(CombinedNormalUV) == sizeof(float) * 3u); + auto newBuff = core::make_smart_refctd_ptr(sizeof(CombinedNormalUV)*approxVxCount); auto* dst = reinterpret_cast(newBuff->getPointer())+meshBuffer->getBaseVertex(); meshBuffer->setVertexBufferBinding({0u,newBuff},freeBinding); @@ -369,11 +364,11 @@ Renderer::InitializationData Renderer::initSceneObjects(const SAssetBundle& mesh vertexInput.attributes[normalAttr].format = EF_R32_UINT; for (auto i=0u; igetAttribute(&dst[i].nml,normalAttr,i); + meshBuffer->getAttribute(&dst[i].normal,normalAttr,i); core::vectorSIMDf uv; meshBuffer->getAttribute(uv,2u,i); - dst[i].u = core::Float16Compressor::compress(uv.x); - dst[i].v = core::Float16Compressor::compress(uv.y); + dst[i].u = uv.x; + dst[i].v = uv.y; } } @@ -401,7 +396,11 @@ Renderer::InitializationData Renderer::initSceneObjects(const SAssetBundle& mesh allocData.resize(meshBuffersToProcess.size()); - cpump->alloc(allocData.data(),meshBuffersToProcess.begin(),meshBuffersToProcess.end()); + if (!cpump->alloc(allocData.data(),meshBuffersToProcess.begin(),meshBuffersToProcess.end())) + { + printf("[ERROR] Failed to Allocate Mesh data in SSBOs, quitting!\n"); + exit(-42); + } cpump->shrinkOutputBuffersSize(); cpump->instantiateDataStorage(); @@ -1139,6 +1138,7 @@ void Renderer::initSceneResources(SAssetBundle& meshes, nbl::io::path&& _sampleS } std::cout << "\tmaxPathDepth = " << maxPathDepth << std::endl; std::cout << "\tnoRussianRouletteDepth = " << noRussianRouletteDepth << std::endl; + std::cout << "\thideEnvironment = " << hideEnvironment << std::endl; std::cout << "\tmaxSamples = " << maxSensorSamples << std::endl; } } @@ -1178,7 +1178,7 @@ void Renderer::deinitSceneResources() m_finalEnvmap = nullptr; m_envMapImportanceSampling.deinitResources(); - m_staticViewData = {{0u,0u},0u,0u,0u,0u,core::infinity(),{}}; + m_staticViewData = {{0u,0u},0u,0u,0u,0u,false,core::infinity(),{}}; auto rr = m_rrManager->getRadeonRaysAPI(); rr->DetachAll(); @@ -1194,6 +1194,7 @@ void Renderer::deinitSceneResources() maxPathDepth = DefaultPathDepth; noRussianRouletteDepth = 5u; + hideEnvironment = false; maxSensorSamples = MaxFreeviewSamples; } @@ -1223,7 +1224,8 @@ void Renderer::initScreenSizedResources( int32_t cascadeCount, 
float cascadeLuminanceBase, float cascadeLuminanceStart, - const float Emin + const float Emin, + const nbl::core::vector& clipPlanes ) { float maxEmitterRadianceLuma; @@ -1238,7 +1240,8 @@ void Renderer::initScreenSizedResources( const float RGB19E7_MaxLuma = std::exp2(63.f); if (cascadeCountstd::numeric_limits::min(); if (core::isnan(cascadeLuminanceStart)) cascadeLuminanceStart = baseIsKnown ? (maxEmitterRadianceLuma*std::pow(cascadeLuminanceBase,-cascadeSegmentCount)):Emin; + // rationale, we don't have NEE and BRDF importance sampling samples with throughput <= 1.0 + // However we have RIS, and that can complicate this assumption a bit if (!baseIsKnown) cascadeLuminanceBase = core::max(std::pow(maxEmitterRadianceLuma/cascadeLuminanceStart,1.f/cascadeSegmentCount),1.0625f); std::cout << "Re-Weighting Monte Carlo = ENABLED [cascadeCount: "< void @@ -1299,6 +1305,17 @@ void Renderer::initScreenSizedResources( m_staticViewData.sampleSequenceStride = SampleSequence::computeQuantizedDimensions(maxPathDepth); auto stream = std::ofstream("runtime_defines.glsl"); + for (auto i=0; im_global.getVTStorageViewCount() << "\n" << m_globalMeta->m_global.m_materialCompilerGLSL_declarations << "#ifndef MAX_RAYS_GENERATED\n" @@ -1315,6 +1332,9 @@ void Renderer::initScreenSizedResources( // cull m_cullGPUShader = gpuSpecializedShaderFromFile(m_assetManager,m_driver,"../cull.comp"); + // visbuffer + m_vertGPUShader = gpuSpecializedShaderFromFile(m_assetManager, m_driver, "../fillVisBuffer.vert"); + // raygen m_raygenGPUShader = gpuSpecializedShaderFromFile(m_assetManager,m_driver,"../raygen.comp"); @@ -1360,7 +1380,7 @@ void Renderer::initScreenSizedResources( if (static_cast(m_driver)->runningInRenderdoc()) // makes Renderdoc capture the modifications done by OpenCL { interopBuffer.buffer = m_driver->createUpStreamingGPUBufferOnDedMem(size); - //interopBuffer.buffer->getBoundMemory()->mapMemoryRange(IDriverMemoryAllocation::EMCAF_WRITE,{0u,size}) +// interopBuffer.buffer->getBoundMemory()->mapMemoryRange(IDriverMemoryAllocation::EMCAF_READ_AND_WRITE,{0u,size}); } else interopBuffer.buffer = m_driver->createDeviceLocalGPUBufferOnDedMem(size); @@ -1388,11 +1408,12 @@ void Renderer::initScreenSizedResources( m_accumulation = createScreenSizedTexture(EF_R32G32_UINT,(cascadeCount+1u)*m_staticViewData.samplesPerPixelPerDispatch); // one more (first) layer because of accumulation metadata for a path m_albedoAcc = createScreenSizedTexture(EF_R32_UINT,m_staticViewData.samplesPerPixelPerDispatch); m_normalAcc = createScreenSizedTexture(EF_R32_UINT,m_staticViewData.samplesPerPixelPerDispatch); + m_maskAcc = createScreenSizedTexture(EF_R16_UNORM,m_staticViewData.samplesPerPixelPerDispatch); m_tonemapOutput = createScreenSizedTexture(EF_R16G16B16A16_SFLOAT); m_albedoRslv = createScreenSizedTexture(EF_A2B10G10R10_UNORM_PACK32); m_normalRslv = createScreenSizedTexture(EF_R16G16B16A16_SFLOAT); - constexpr uint32_t MaxDescritorUpdates = 10u; + constexpr uint32_t MaxDescritorUpdates = 11u; IGPUDescriptorSet::SDescriptorInfo infos[MaxDescritorUpdates]; IGPUDescriptorSet::SWriteDescriptorSet writes[MaxDescritorUpdates]; @@ -1409,20 +1430,21 @@ void Renderer::initScreenSizedResources( setImageInfo(infos+2,asset::EIL_GENERAL,core::smart_refctd_ptr(m_accumulation)); setImageInfo(infos+5,asset::EIL_GENERAL,core::smart_refctd_ptr(m_albedoAcc)); setImageInfo(infos+6,asset::EIL_GENERAL,core::smart_refctd_ptr(m_normalAcc)); + setImageInfo(infos+7,asset::EIL_GENERAL,core::smart_refctd_ptr(m_maskAcc)); // envmap { - 
setImageInfo(infos+7,asset::EIL_GENERAL,core::smart_refctd_ptr(m_finalEnvmap)); + setImageInfo(infos+8,asset::EIL_GENERAL,core::smart_refctd_ptr(m_finalEnvmap)); ISampler::SParams samplerParams = { ISampler::ETC_REPEAT, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS }; - infos[7].image.sampler = m_driver->createGPUSampler(samplerParams); - infos[7].image.imageLayout = EIL_SHADER_READ_ONLY_OPTIMAL; + infos[8].image.sampler = m_driver->createGPUSampler(samplerParams); + infos[8].image.imageLayout = EIL_SHADER_READ_ONLY_OPTIMAL; } // warpmap { - setImageInfo(infos+8,asset::EIL_GENERAL,core::smart_refctd_ptr(warpMap)); + setImageInfo(infos+9,asset::EIL_GENERAL,core::smart_refctd_ptr(warpMap)); ISampler::SParams samplerParams = { ISampler::ETC_REPEAT, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS }; - infos[8].image.sampler = m_driver->createGPUSampler(samplerParams); - infos[8].image.imageLayout = EIL_SHADER_READ_ONLY_OPTIMAL; + infos[9].image.sampler = m_driver->createGPUSampler(samplerParams); + infos[9].image.imageLayout = EIL_SHADER_READ_ONLY_OPTIMAL; } IGPUDescriptorSet::SDescriptorInfo luminanceDescriptorInfo = {}; @@ -1442,7 +1464,7 @@ void Renderer::initScreenSizedResources( for (auto i=0u; i<2u; i++) m_commonRaytracingDS[i] = m_driver->createGPUDescriptorSet(core::smart_refctd_ptr(m_commonRaytracingDSLayout)); - constexpr auto descriptorUpdateCount = 10u; + constexpr auto descriptorUpdateCount = 11u; setDstSetAndDescTypesOnWrites(m_commonRaytracingDS[0].get(),writes,infos,{ EDT_UNIFORM_BUFFER, EDT_UNIFORM_TEXEL_BUFFER, @@ -1451,17 +1473,18 @@ void Renderer::initScreenSizedResources( EDT_STORAGE_BUFFER, EDT_STORAGE_IMAGE, EDT_STORAGE_IMAGE, + EDT_STORAGE_IMAGE, EDT_COMBINED_IMAGE_SAMPLER, EDT_COMBINED_IMAGE_SAMPLER, }); // Set last write - writes[9].binding = 9u; - writes[9].arrayElement = 0u; - writes[9].count = 1u; - writes[9].descriptorType = EDT_COMBINED_IMAGE_SAMPLER; - writes[9].dstSet = m_commonRaytracingDS[0].get(); - writes[9].info = &luminanceDescriptorInfo; + writes[10].binding = 10u; + writes[10].arrayElement = 0u; + writes[10].count = 1u; + writes[10].descriptorType = EDT_COMBINED_IMAGE_SAMPLER; + writes[10].dstSet = m_commonRaytracingDS[0].get(); + writes[10].info = &luminanceDescriptorInfo; m_driver->updateDescriptorSets(descriptorUpdateCount,writes,0u,nullptr); // set up second DS @@ -1538,23 +1561,24 @@ void Renderer::initScreenSizedResources( } setImageInfo(infos+2,asset::EIL_GENERAL,std::move(albedoSamplerView)); setImageInfo(infos+3,asset::EIL_GENERAL,core::smart_refctd_ptr(m_normalAcc)); - setImageInfo(infos+4,asset::EIL_GENERAL,core::smart_refctd_ptr(m_tonemapOutput)); + setImageInfo(infos+4,asset::EIL_GENERAL,core::smart_refctd_ptr(m_maskAcc)); + setImageInfo(infos+5,asset::EIL_GENERAL,core::smart_refctd_ptr(m_tonemapOutput)); core::smart_refctd_ptr albedoStorageView; { IGPUImageView::SCreationParams viewparams = m_albedoRslv->getCreationParameters(); viewparams.format = EF_R32_UINT; albedoStorageView = m_driver->createGPUImageView(std::move(viewparams)); } - setImageInfo(infos+5,asset::EIL_GENERAL,std::move(albedoStorageView)); - setImageInfo(infos+6,asset::EIL_GENERAL,core::smart_refctd_ptr(m_normalRslv)); + setImageInfo(infos+6,asset::EIL_GENERAL,std::move(albedoStorageView)); + 
setImageInfo(infos+7,asset::EIL_GENERAL,core::smart_refctd_ptr(m_normalRslv)); setDstSetAndDescTypesOnWrites(m_resolveDS.get(),writes,infos,{ EDT_UNIFORM_BUFFER, - EDT_COMBINED_IMAGE_SAMPLER,EDT_COMBINED_IMAGE_SAMPLER,EDT_COMBINED_IMAGE_SAMPLER, + EDT_COMBINED_IMAGE_SAMPLER,EDT_COMBINED_IMAGE_SAMPLER,EDT_COMBINED_IMAGE_SAMPLER,EDT_COMBINED_IMAGE_SAMPLER, EDT_STORAGE_IMAGE,EDT_STORAGE_IMAGE,EDT_STORAGE_IMAGE }); } - m_driver->updateDescriptorSets(7u,writes,0u,nullptr); + m_driver->updateDescriptorSets(8u,writes,0u,nullptr); m_visibilityBuffer = m_driver->addFrameBuffer(); m_visibilityBuffer->attach(EFAP_DEPTH_ATTACHMENT,createScreenSizedTexture(EF_D32_SFLOAT)); @@ -1602,6 +1626,7 @@ void Renderer::deinitScreenSizedResources() m_accumulation = m_tonemapOutput = nullptr; m_albedoAcc = m_albedoRslv = nullptr; m_normalAcc = m_normalRslv = nullptr; + m_maskAcc = nullptr; glFinish(); @@ -1631,6 +1656,7 @@ void Renderer::deinitScreenSizedResources() m_staticViewData.maxPathDepth = DefaultPathDepth; m_staticViewData.noRussianRouletteDepth = 5u; m_staticViewData.samplesPerPixelPerDispatch = 1u; + m_staticViewData.hideEnvmap = false; m_staticViewData.envMapPDFNormalizationFactor = core::infinity(); m_staticViewData.cascadeParams = {}; m_totalRaysCast = 0ull; @@ -1851,7 +1877,24 @@ bool Renderer::render(nbl::ITimer* timer, const float kappa, const float Emin, c bool compiledShaders = compileShadersFuture.get(); if(compiledShaders) { - m_cullPipeline = m_driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(m_cullPipelineLayout), core::smart_refctd_ptr(m_cullGPUShader)); + m_cullPipeline = m_driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(m_cullPipelineLayout), core::smart_refctd_ptr(m_cullGPUShader)); + { + IGPUSpecializedShader* shaders[] = {m_vertGPUShader.get(),m_fragGPUShader.get()}; + SPrimitiveAssemblyParams primitiveAssembly; + primitiveAssembly.primitiveType = EPT_TRIANGLE_LIST; + SRasterizationParams raster; + raster.faceCullingMode = EFCM_NONE; + auto _visibilityBufferFillPipelineLayout = m_driver->createGPUPipelineLayout( + nullptr,nullptr, + core::smart_refctd_ptr(m_rasterInstanceDataDSLayout), + core::smart_refctd_ptr(m_additionalGlobalDSLayout), + core::smart_refctd_ptr(m_cullDSLayout) + ); + m_visibilityBufferFillPipeline = m_driver->createGPURenderpassIndependentPipeline( + nullptr,std::move(_visibilityBufferFillPipelineLayout),shaders,shaders+2u, + SVertexInputParams{},SBlendParams{},primitiveAssembly,raster + ); + } m_raygenPipeline = m_driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(m_raygenPipelineLayout), core::smart_refctd_ptr(m_raygenGPUShader)); m_closestHitPipeline = m_driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(m_closestHitPipelineLayout), core::smart_refctd_ptr(m_closestHitGPUShader)); m_resolvePipeline = m_driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(m_resolvePipelineLayout), core::smart_refctd_ptr(m_resolveGPUShader)); @@ -2095,7 +2138,17 @@ bool Renderer::traceBounce(uint32_t& raycount) std::cout << "[ERROR] RadeonRays Timed Out" << std::endl; return false; } + + if (static_cast(m_driver)->runningInRenderdoc()) + { + auto touchAllBytes = [](IGPUBuffer* buf)->void + { + auto ptr = reinterpret_cast(buf->getBoundMemory()->getMappedPointer()); + }; + touchAllBytes(m_intersectionBuffer[descSetIx].buffer.get()); + } } + // compute bounce (accumulate contributions and optionally generate rays) { diff --git a/22.RaytracedAO/Renderer.h b/22.RaytracedAO/Renderer.h index 912eadd7b..5c8e45738 100644 --- 
a/22.RaytracedAO/Renderer.h +++ b/22.RaytracedAO/Renderer.h @@ -55,7 +55,8 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac int32_t cascadeCount, float cascadeLuminanceBase, float cascadeLuminanceStart, - const float Emin + const float Emin, + const nbl::core::vector& clipPlanes={} ); void deinitScreenSizedResources(); @@ -180,6 +181,7 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac nbl::core::smart_refctd_ptr m_resolvePipelineLayout; nbl::core::smart_refctd_ptr m_cullGPUShader; + nbl::core::smart_refctd_ptr m_vertGPUShader,m_fragGPUShader; nbl::core::smart_refctd_ptr m_raygenGPUShader; nbl::core::smart_refctd_ptr m_closestHitGPUShader; nbl::core::smart_refctd_ptr m_resolveGPUShader; @@ -207,7 +209,8 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac nbl::core::smart_refctd_ptr bufferView; } sampleSequence; uint16_t maxPathDepth; - uint16_t noRussianRouletteDepth; + uint16_t noRussianRouletteDepth : 15; + uint16_t hideEnvironment : 1; uint32_t maxSensorSamples; // scene specific data @@ -254,6 +257,7 @@ class Renderer : public nbl::core::IReferenceCounted, public nbl::core::Interfac nbl::core::smart_refctd_ptr m_accumulation,m_tonemapOutput; nbl::core::smart_refctd_ptr m_albedoAcc,m_albedoRslv; nbl::core::smart_refctd_ptr m_normalAcc,m_normalRslv; + nbl::core::smart_refctd_ptr m_maskAcc; nbl::video::IFrameBuffer* m_visibilityBuffer,* m_colorBuffer; // Resources used for envmap sampling diff --git a/22.RaytracedAO/closestHit.comp b/22.RaytracedAO/closestHit.comp index a801b48d3..85d746b54 100644 --- a/22.RaytracedAO/closestHit.comp +++ b/22.RaytracedAO/closestHit.comp @@ -71,9 +71,10 @@ void main() // positions const vec3 lastVxPos = load_positions(batchInstanceData,indices); + if (!bool(batchInstanceData.determinantSignBit&0x80000000u)) + normalizedG = -normalizedG; + const bool frontfacing = dot(normalizedV,normalizedG)>=0.f; - const bool frontfacing = bool((batchInstanceData.determinantSignBit^floatBitsToUint(dot(normalizedV,normalizedG)))&0x80000000u); - // get material const nbl_glsl_MC_oriented_material_t material = nbl_glsl_MC_material_data_t_getOriented(batchInstanceData.material,frontfacing); contrib.color = contrib.albedo = nbl_glsl_MC_oriented_material_t_getEmissive(material, normalizedV); @@ -97,7 +98,7 @@ void main() ); const vec3 origin = dPdBary*compactBary+lastVxPos; - generate_next_rays( + rayMask = generate_next_rays( MAX_RAYS_GENERATED,material,frontfacing,vertex_depth, scramble_start_state,sampleID,outPixelLocation,origin, throughput,aovThroughputScale,contrib.albedo,contrib.worldspaceNormal @@ -117,22 +118,42 @@ void main() contrib.color *= throughput; const vec3 aovThroughput = throughput*aovThroughputScale; // - const bool pathToBeContinued = bool(rayMask); - if (pathToBeContinued) - addAccumulation(contrib.color,accumulationLocation); - else + if (isRWMCEnabled()) { - // need whole path throughput when splatting - contrib.color += fetchAccumulation(accumulationLocation); - const nbl_glsl_RWMC_SplattingParameters splat = nbl_glsl_RWMC_getCascade(staticViewData.cascadeParams,nbl_glsl_MC_colorToScalar(contrib.color)/pc.cummon.rcpFramesDispatched); - for (uint j=0u; j<2u; j++) - addAccumulationCascade( - contrib.color*splat.cascadeWeights[j],accumulationLocation, - samplesPerPixelPerDispatch,splat.lowerCascade+j - ); + const bool pathToBeContinued = bool(rayMask); + if (pathToBeContinued) + addAccumulation(contrib.color,accumulationLocation); + else + { + // need whole 
path throughput when splatting + contrib.color += fetchAccumulation(accumulationLocation); + const nbl_glsl_RWMC_SplattingParameters splat = nbl_glsl_RWMC_getCascade(staticViewData.cascadeParams,nbl_glsl_MC_colorToScalar(contrib.color)/pc.cummon.rcpFramesDispatched); + for (uint j=0u; j<2u; j++) + addAccumulationCascade( + contrib.color*splat.cascadeWeights[j],accumulationLocation, + samplesPerPixelPerDispatch,splat.lowerCascade+j + ); + } } + else + addAccumulation(contrib.color,accumulationLocation); // addAlbedo(contrib.albedo*aovThroughput,accumulationLocation); addWorldspaceNormal(contrib.worldspaceNormal*nbl_glsl_MC_colorToScalar(aovThroughput),accumulationLocation); + // only misses contribute to transparency + if (bool(staticViewData.sampleSequenceStride_hideEnvmap>>31)) + { + float mask = 0.f; + if (!hit) + { + // make the luma of throughput dictate transparency + mask = dot(aovThroughput,transpose(nbl_glsl_sRGBtoXYZ)[1]); + // only count transmissions + const vec2 texCoordUV = (vec2(accumulationLocation.xy)+vec2(0.5))/vec2(getImageDimensions(staticViewData)); + const vec3 seeThroughDir = normalize(mat3(pc.cummon.viewDirReconFactors)*vec3(texCoordUV,1.f)); + mask *= pow(max(dot(normalizedV,seeThroughDir),0.f),1024.f); + } + addMask(mask,accumulationLocation); + } } } \ No newline at end of file diff --git a/22.RaytracedAO/fillVisBuffer.frag b/22.RaytracedAO/fillVisBuffer.frag index 9bce3dc26..88a18455a 100644 --- a/22.RaytracedAO/fillVisBuffer.frag +++ b/22.RaytracedAO/fillVisBuffer.frag @@ -29,7 +29,7 @@ void main() vec2 bary = nbl_glsl_barycentric_frag_get(); const int triangleIDBitcount = findMSB(MAX_TRIANGLES_IN_BATCH-1)+1; - frontFacingTriangleIDDrawID_unorm16Bary_dBarydScreenHalf2x2[0] = bitfieldInsert(BackfacingBit_BatchInstanceGUID,gl_PrimitiveID,31-triangleIDBitcount,triangleIDBitcount)^(gl_FrontFacing ? 
0x0u:0x80000000u); + frontFacingTriangleIDDrawID_unorm16Bary_dBarydScreenHalf2x2[0] = bitfieldInsert(BackfacingBit_BatchInstanceGUID,gl_PrimitiveID,31-triangleIDBitcount,triangleIDBitcount); frontFacingTriangleIDDrawID_unorm16Bary_dBarydScreenHalf2x2[1] = packUnorm2x16(bary); frontFacingTriangleIDDrawID_unorm16Bary_dBarydScreenHalf2x2[2] = packHalf2x16(dFdx(bary)); frontFacingTriangleIDDrawID_unorm16Bary_dBarydScreenHalf2x2[3] = packHalf2x16(dFdy(bary)); diff --git a/22.RaytracedAO/fillVisBuffer.vert b/22.RaytracedAO/fillVisBuffer.vert index 6c9279e02..eca6aa925 100644 --- a/22.RaytracedAO/fillVisBuffer.vert +++ b/22.RaytracedAO/fillVisBuffer.vert @@ -10,6 +10,8 @@ #define _NBL_GLSL_EXT_MITSUBA_LOADER_INSTANCE_DATA_BINDING_ 0 #include "virtualGeometry.glsl" +#include "runtime_defines.glsl" + layout(set=2, binding=0, row_major) readonly restrict buffer PerInstancePerCamera { DrawData_t data[]; @@ -31,4 +33,25 @@ void main() const vec3 modelPos = nbl_glsl_fetchVtxPos(gl_VertexIndex,InstData.data[batchInstanceGUID]); nbl_glsl_barycentric_vert_set(modelPos); gl_Position = nbl_glsl_pseudoMul4x4with3x1(self.MVP,modelPos); + + // clipping +#ifdef CLIP_PLANE_0 + const vec4 worldPos = vec4(nbl_glsl_pseudoMul3x4with3x1(InstData.data[batchInstanceGUID].tform,modelPos),1.0); + gl_ClipDistance[0] = dot(CLIP_PLANE_0,worldPos); +#ifdef CLIP_PLANE_1 + gl_ClipDistance[1] = dot(CLIP_PLANE_1,worldPos); +#ifdef CLIP_PLANE_2 + gl_ClipDistance[2] = dot(CLIP_PLANE_2,worldPos); +#ifdef CLIP_PLANE_3 + gl_ClipDistance[3] = dot(CLIP_PLANE_3,worldPos); +#ifdef CLIP_PLANE_4 + gl_ClipDistance[4] = dot(CLIP_PLANE_4,worldPos); +#ifdef CLIP_PLANE_5 + gl_ClipDistance[5] = dot(CLIP_PLANE_5,worldPos); +#endif +#endif +#endif +#endif +#endif +#endif } diff --git a/22.RaytracedAO/main.cpp b/22.RaytracedAO/main.cpp index 5c44b2396..855a6ac63 100644 --- a/22.RaytracedAO/main.cpp +++ b/22.RaytracedAO/main.cpp @@ -60,6 +60,9 @@ class RaytracerExampleEventReceiver : public nbl::IEventReceiver case ReloadKey: reloadKeyPressed = true; break; + case OverloadCameraKey: + overloadCameraKeyPressed = true; + break; case QuitKey: running = false; return true; @@ -89,6 +92,8 @@ class RaytracerExampleEventReceiver : public nbl::IEventReceiver inline bool isReloadKeyPressed() const { return reloadKeyPressed; } + inline bool isOverloadCameraKeyPressed() const { return overloadCameraKeyPressed; } + inline void resetKeys() { skipKeyPressed = false; @@ -98,6 +103,7 @@ class RaytracerExampleEventReceiver : public nbl::IEventReceiver screenshotKeyPressed = false; logProgressKeyPressed = false; reloadKeyPressed = false; + overloadCameraKeyPressed = false; } private: @@ -110,6 +116,7 @@ class RaytracerExampleEventReceiver : public nbl::IEventReceiver static constexpr nbl::EKEY_CODE LogProgressKey = nbl::KEY_KEY_L; static constexpr nbl::EKEY_CODE BeautyKey = nbl::KEY_KEY_B; static constexpr nbl::EKEY_CODE ReloadKey = nbl::KEY_F5; + static constexpr nbl::EKEY_CODE OverloadCameraKey = nbl::KEY_KEY_C; bool running; bool renderingBeauty; @@ -121,6 +128,7 @@ class RaytracerExampleEventReceiver : public nbl::IEventReceiver bool screenshotKeyPressed; bool logProgressKeyPressed; bool reloadKeyPressed; + bool overloadCameraKeyPressed; }; struct PersistentState @@ -264,7 +272,14 @@ int main(int argc, char** argv) for (auto i = 1ul; i < argc; ++i) arguments.emplace_back(argv[i]); } - + std::cout << std::endl; + std::cout << "-- Build URL:" << std::endl; + std::cout << NBL_BUILD_URL << std::endl; + std::cout << std::endl; + std::cout << "-- Build log:" << 
std::endl; + std::cout << NBL_GIT_LOG << std::endl; + std::cout << std::endl; + bool applicationIsReloaded = false; PersistentState applicationState; { @@ -536,6 +551,7 @@ int main(int argc, char** argv) float Emin = 0.05f; bool envmap = false; float envmapRegFactor = 0.0f; + core::vector clipPlanes; scene::CSceneNodeAnimatorCameraModifiedMaya* getInteractiveCameraAnimator() { @@ -614,8 +630,8 @@ int main(int argc, char** argv) }; const bool shouldHaveSensorIdxInFileName = globalMeta->m_global.m_sensors.size() > 1; - std::vector sensors = std::vector(); - std::vector cubemapRenders = std::vector(); + std::vector sensors; + std::vector cubemapRenders; auto extractAndAddToSensorData = [&](const ext::MitsubaLoader::CElementSensor& sensor, uint32_t idx) -> bool { @@ -683,6 +699,17 @@ int main(int argc, char** argv) return false; } mainSensorData.type = sensor.type; + + for (auto i=0; iclipPlanes[i]; + if ((plane!=core::vectorSIMDf()).any()) + { + mainSensorData.clipPlanes.push_back(plane); + printf("Found Clip Plane %f,%f,%f,%f\n",plane[0],plane[1],plane[2],plane[3]); + } + } + mainSensorData.rotateSpeed = cameraBase->rotateSpeed; mainSensorData.stepZoomSpeed = cameraBase->zoomSpeed; mainSensorData.moveSpeed = cameraBase->moveSpeed; @@ -753,6 +780,9 @@ int main(int argc, char** argv) auto tpose = core::transpose(core::matrix4SIMD(relativeTransform)); mainCamUp = tpose.rows[1]; mainCamView = tpose.rows[2]; + + std::cout << "\t Camera Reconstructed UpVector = <" << mainCamUp.x << "," << mainCamUp.y << "," << mainCamUp.z << ">" << std::endl; + std::cout << "\t Camera Reconstructed Forward = <" << mainCamView.x << "," << mainCamView.y << "," << mainCamView.z << ">" << std::endl; } float realFoVDegrees; @@ -874,8 +904,19 @@ int main(int argc, char** argv) staticCamera->setTarget(target.getAsVector3df()); } - if (core::dot(core::normalize(core::cross(staticCamera->getUpVector(),mainCamView)),core::cross(mainCamUp,mainCamView)).x<0.99f) - staticCamera->setUpVector(mainCamUp); + { + auto declaredUp = cameraBase->up; + auto reconstructedRight = core::cross(declaredUp,mainCamView); + auto actualRight = core::cross(mainCamUp,mainCamView); + // special formulation avoiding multiple sqrt and inversesqrt to preserve precision + const float dp = core::dot(reconstructedRight,actualRight).x/core::sqrt((core::dot(reconstructedRight,reconstructedRight)*core::dot(actualRight,actualRight)).x); + const float pb = core::dot(declaredUp,mainCamView).x/core::sqrt((core::dot(declaredUp,declaredUp)*core::dot(mainCamView,mainCamView)).x); + std::cout << "\t Camera Reconstructed UpVector match score = "<< dp << std::endl; + if (dp>0.97f && dp<1.03f && abs(pb)<0.9996f) + staticCamera->setUpVector(declaredUp); + else + staticCamera->setUpVector(mainCamUp); + } // if (ortho) @@ -1059,7 +1100,7 @@ int main(int argc, char** argv) if(needsReinit) { renderer->deinitScreenSizedResources(); - renderer->initScreenSizedResources(sensor.width,sensor.height,sensor.envmapRegFactor,sensor.cascadeCount,sensor.cascadeLuminanceBase,sensor.cascadeLuminanceStart,sensor.Emin); + renderer->initScreenSizedResources(sensor.width,sensor.height,sensor.envmapRegFactor,sensor.cascadeCount,sensor.cascadeLuminanceBase,sensor.cascadeLuminanceStart,sensor.Emin,sensor.clipPlanes); } smgr->setActiveCamera(sensor.staticCamera); @@ -1194,7 +1235,7 @@ int main(int argc, char** argv) { renderer->deinitScreenSizedResources(); const auto& sensorData = sensors[activeSensor]; - 
renderer->initScreenSizedResources(sensorData.width,sensorData.height,sensorData.envmapRegFactor,sensorData.cascadeCount,sensorData.cascadeLuminanceBase,sensorData.cascadeLuminanceStart,sensorData.Emin); + renderer->initScreenSizedResources(sensorData.width,sensorData.height,sensorData.envmapRegFactor,sensorData.cascadeCount,sensorData.cascadeLuminanceBase,sensorData.cascadeLuminanceStart,sensorData.Emin,sensorData.clipPlanes); } smgr->setActiveCamera(sensors[activeSensor].interactiveCamera); @@ -1217,12 +1258,39 @@ int main(int argc, char** argv) sensors[activeSensor].resetInteractiveCamera(); std::cout << "Interactive Camera Position and Target has been Reset." << std::endl; } - if(receiver.isNextPressed()) + else if(receiver.isOverloadCameraKeyPressed()) + { + pfd::open_file file("Choose XML file to overload camera with (only first sensor overrides)", "../../media/mitsuba", { "XML files (.xml)", "*.xml" }); + if (!file.result().empty()) + { + const auto filePath = file.result()[0]; + using namespace nbl::asset; + smart_refctd_ptr mitsubaMetadata; + { + static const IAssetLoader::SAssetLoadParams mitsubaLoaderParams = { 0, nullptr, IAssetLoader::ECF_DONT_CACHE_REFERENCES, nullptr, IAssetLoader::ELPF_LOAD_METADATA_ONLY }; + auto meshes_bundle = device->getAssetManager()->getAsset(filePath.data(),mitsubaLoaderParams); + if (!meshes_bundle.getContents().empty()) + mitsubaMetadata = smart_refctd_ptr(static_cast(meshes_bundle.getMetadata())); + } + if (!mitsubaMetadata || mitsubaMetadata->m_global.m_sensors.empty()) + os::Printer::log("ERROR (" + std::to_string(__LINE__) + " line): The xml file is invalid/cannot be loaded! File path: " + filePath, ELL_ERROR); + else + { + const uint32_t originalSensorCount = sensors.size(); + uint32_t idx = originalSensorCount; + for (const auto& sensor : mitsubaMetadata->m_global.m_sensors) + extractAndAddToSensorData(sensor,idx++); + setActiveSensor(originalSensorCount); + } + writeLastRunState = true; + } + } + else if(receiver.isNextPressed()) { setActiveSensor(activeSensor + 1); writeLastRunState = true; } - if(receiver.isPreviousPressed()) + else if(receiver.isPreviousPressed()) { setActiveSensor(activeSensor - 1); writeLastRunState = true; @@ -1322,7 +1390,8 @@ int main(int argc, char** argv) auto samples = renderer->getTotalSamplesComputed(); auto rays = renderer->getTotalRaysCast(); const double microsecondsElapsed = std::chrono::duration_cast(std::chrono::steady_clock::now()-start).count(); - str << L"Raytraced Shadows Demo - Nabla Engine MegaSamples: " << samples/1000000ull + str << L"Nabla Path Tracer: " << applicationState.zipPath.c_str() << "\\" << applicationState.xmlPath.c_str() + << " MegaSamples: " << samples/1000000ull << " MSample/s: " << double(samples)/microsecondsElapsed << " MRay/s: " << double(rays)/microsecondsElapsed; diff --git a/22.RaytracedAO/raygen.comp b/22.RaytracedAO/raygen.comp index 71e6ff7f2..2b45dec4d 100644 --- a/22.RaytracedAO/raygen.comp +++ b/22.RaytracedAO/raygen.comp @@ -41,11 +41,10 @@ void main() if (hit) { // vis buffer decode - const bool frontfacing = !bool(visBuffer[0]&0x80000000u); const int triangleIDBitcount = findMSB(MAX_TRIANGLES_IN_BATCH-1)+1; const uint triangleID = bitfieldExtract(visBuffer[0],31-triangleIDBitcount,triangleIDBitcount); const uint batchInstanceGUID = bitfieldExtract(visBuffer[0],0,31-triangleIDBitcount); - const vec2 compactBary = unpackUnorm2x16(visBuffer[1]); +//const vec2 compactBary = unpackUnorm2x16(visBuffer[1]); #ifdef TEX_PREFETCH_STREAM // TODO: separate pipeline and separate 
out the barycentric derivative FBO attachment, only write if need to, only fetch if `needs_texture_prefetch` const mat2 dBarydScreen = mat2(unpackHalf2x16(visBuffer[2]),unpackHalf2x16(visBuffer[3])); @@ -57,17 +56,47 @@ void main() // load vertex data const vec3 lastVxPos = load_positions(batchInstanceData,indices); + if (!bool(batchInstanceData.determinantSignBit&0x80000000u)) + normalizedG = -normalizedG; + const float VdotG = dot(normalizedV,normalizedG); + const bool frontfacing = VdotG>=0.f; - // get material while waiting for indices + // get material const nbl_glsl_MC_oriented_material_t material = nbl_glsl_MC_material_data_t_getOriented(batchInstanceData.material,frontfacing); contrib.color = contrib.albedo = nbl_glsl_MC_oriented_material_t_getEmissive(material,normalizedV); // little optimization for non-twosided materials if (material.genchoice_count!=0u) { - // get initial scramble key while waiting for vertex positions + // get initial scramble key const nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,ivec2(outPixelLocation),0).rg; + vec3 origin; + #if RECOMPUTE_BARY + // we know the ray will intersect the triangle + vec2 compactBary; + { + // reversed order of arguments for each cross cause V is negative + const vec3 ray_cross_e2 = cross(dPdBary[1],normalizedV); + const float detRcp = 1.f/dot(dPdBary[0],ray_cross_e2); + // assert(!isinf(detRcp)); + const vec3 s = (pc.cummon.viewDirReconFactors[3]-lastVxPos)*detRcp; + const float u = dot(s,ray_cross_e2); + // assert(0.f<=u && u<=1.f) + const vec3 s_cross_e1 = cross(s,dPdBary[0]); + const float v = -dot(normalizedV,s_cross_e1); + // assert(0.f<=v && v<=1.f) + compactBary = vec2(u,v); + // + const float t = dot(dPdBary[1],s_cross_e1); + //assert(t>0.f); + origin = pc.cummon.viewDirReconFactors[3]-normalizedV*t; + } + #else + const vec2 compactBary = unpackUnorm2x16(visBuffer[1]); + #endif + origin = dPdBary*compactBary+lastVxPos; + // normalizedN = load_normal_and_prefetch_textures( batchInstanceData,indices,compactBary,material @@ -75,10 +104,6 @@ void main() ,dBarydScreen #endif ); - - const vec3 origin = dPdBary*compactBary+lastVxPos; - // does this buy us any precision? (answer run CI!) - //normalizedV = normalize(pc.cummon.viewDirReconFactors[3]-origin); // generate rays const uint vertex_depth = 1u; @@ -105,38 +130,57 @@ void main() { const uvec3 coord = uvec3(outPixelLocation,i); - nbl_glsl_RWMC_SplattingParameters splat = nbl_glsl_RWMC_getCascade(staticViewData.cascadeParams,luma); - const bool pathToBeContinued = bool((rayMask>>i)&0x1u); - if (pathToBeContinued) - { - storeAccumulation(contrib.color*pc.cummon.rcpFramesDispatched,coord); - splat.cascadeWeights = vec2(0.f,0.f); - } - const uint higherCascade = splat.lowerCascade+1u; - const uint cascadeCount = staticViewData.cascadeParams.penultimateCascadeIx+2u; - for (uint cascadeIx=0u; cascadeIx>i)&0x1u); + if (pathToBeContinued) + { + storeAccumulation(contrib.color*pc.cummon.rcpFramesDispatched,coord); + splat.cascadeWeights = vec2(0.f,0.f); + } + + const uint higherCascade = splat.lowerCascade+1u; + const uint cascadeCount = staticViewData.cascadeParams.penultimateCascadeIx+2u; + for (uint cascadeIx=0u; cascadeIx>31); // clear accumulations totally if beginning a new frame if (firstFrame) { + if (!isRWMCEnabled()) + storeAccumulation(contrib.color,coord); storeAlbedo(contrib.albedo,coord); storeWorldspaceNormal(contrib.worldspaceNormal,coord); + storeMask(hideEnvmap&&(!hit) ? 
1.f:0.f,coord); } else { + if (!isRWMCEnabled()) + { + const vec3 prev = fetchAccumulation(coord); + const vec3 delta = (contrib.color-prev)*pc.cummon.rcpFramesDispatched; + if (any(greaterThan(abs(delta),vec3(exp2(-19.f))))) + storeAccumulation(prev+delta,coord); + } addAlbedo(contrib.albedo,coord,pc.cummon.rcpFramesDispatched); addWorldspaceNormal(contrib.worldspaceNormal,coord,pc.cummon.rcpFramesDispatched); + if (hideEnvmap) + addMask(hit ? 0.f:1.f,coord,pc.cummon.rcpFramesDispatched); } } } diff --git a/22.RaytracedAO/raytraceCommon.glsl b/22.RaytracedAO/raytraceCommon.glsl index fe7fc667c..10f49273f 100644 --- a/22.RaytracedAO/raytraceCommon.glsl +++ b/22.RaytracedAO/raytraceCommon.glsl @@ -44,10 +44,11 @@ layout(set = 2, binding = 4) restrict coherent buffer RayCount // maybe remove c // aovs layout(set = 2, binding = 5, r32ui) restrict uniform uimage2DArray albedoAOV; layout(set = 2, binding = 6, r32ui) restrict uniform uimage2DArray normalAOV; +layout(set = 2, binding = 7, r16) restrict uniform image2DArray maskAOV; // environment emitter -layout(set = 2, binding = 7) uniform sampler2D envMap; -layout(set = 2, binding = 8) uniform sampler2D warpMap; -layout(set = 2, binding = 9) uniform sampler2D luminance; +layout(set = 2, binding = 8) uniform sampler2D envMap; +layout(set = 2, binding = 9) uniform sampler2D warpMap; +layout(set = 2, binding = 10) uniform sampler2D luminance; void clear_raycount() { @@ -73,6 +74,11 @@ uvec3 get_triangle_indices(in nbl_glsl_ext_Mitsuba_Loader_instance_data_t batchI #include #include +bool isRWMCEnabled() +{ + return staticViewData.cascadeParams.penultimateCascadeIx!=uint(-2); +} + vec3 fetchAccumulation(in uvec3 coord) { const uvec2 data = imageLoad(accumulation,ivec3(coord)).rg; @@ -85,7 +91,7 @@ void storeAccumulation(in vec3 color, in uvec3 coord) } void addAccumulation(in vec3 delta, in uvec3 coord) { - if (any(greaterThan(delta,vec3(exp2(-19.f))))) + if (any(greaterThan(abs(delta),vec3(exp2(-19.f))))) { const vec3 prev = fetchAccumulation(coord); const vec3 newVal = prev+delta; @@ -98,20 +104,18 @@ void addAccumulation(in vec3 delta, in uvec3 coord) // TODO: use a R17G17B17_UNORM format matched to cascade range, then use 13 bits to store last spp count (max 8k spp renders) // This way we can avoid writing every cascade every path storage -void nextSampleAccumulationCascade(in vec3 weightedDelta, uvec3 coord, in uint samplesPerPixelPerDispatch, in uint cascadeIndex, in float rcpN) +void nextSampleAccumulationCascade(in bool firstFrame, in vec3 weightedDelta, uvec3 coord, in uint samplesPerPixelPerDispatch, in uint cascadeIndex, in float rcpN) { // but leave first index in the array for the ray accumulation metadata, hence the +1 coord.z += (cascadeIndex+1u)*samplesPerPixelPerDispatch; - const vec3 prev = fetchAccumulation(coord); + const vec3 prev = firstFrame ? 
vec3(0.0):fetchAccumulation(coord); const vec3 newVal = prev+(weightedDelta-prev)*rcpN; - // TODO: do a better check, compare actually encoded values for difference - const uvec3 diff = floatBitsToUint(newVal)^floatBitsToUint(prev); - if (bool((diff.x|diff.y|diff.z)&0x7ffffff0u)) - storeAccumulation(newVal,coord); + // always store, cause we need to reset the value + storeAccumulation(newVal,coord); } void addAccumulationCascade(in vec3 weightedDelta, uvec3 coord, in uint samplesPerPixelPerDispatch, in uint cascadeIndex) { - if (any(greaterThan(weightedDelta,vec3(exp2(-19.f))))) + if (any(greaterThan(abs(weightedDelta),vec3(exp2(-19.f))))) { // but leave first index in the array for the ray accumulation metadata, hence the +1 coord.z += (cascadeIndex+1u)*samplesPerPixelPerDispatch; @@ -174,6 +178,29 @@ void addWorldspaceNormal(vec3 delta, in uvec3 coord) impl_addWorldspaceNormal(delta,coord,0.f,false); } +void storeMask(in float mask, in uvec3 coord) +{ + imageStore(maskAOV,ivec3(coord),vec4(mask,0.f,0.f,0.f)); +} +void impl_addMask(float delta, in uvec3 coord, in float rcpN, in bool newSample) +{ + const float prev = imageLoad(maskAOV,ivec3(coord)).r; + if (newSample) + delta = (delta-prev)*rcpN; + if (abs(delta)>1.f/65536.f) + storeMask(prev+delta,coord); +} +// for starting a new sample +void addMask(float delta, in uvec3 coord, in float rcpN) +{ + impl_addMask(delta,coord,rcpN,true); +} +// for adding to the last sample +void addMask(float delta, in uvec3 coord) +{ + impl_addMask(delta,coord,0.f,false); +} + // due to memory limitations we can only do 6k renders // so that's 13 bits for width, 12 bits for height, which leaves us with 7 bits for throughput void packOutPixelLocationAndAoVThroughputFactor(out float val, in uvec2 outPixelLocation, in float aovThroughputFactor) @@ -279,6 +306,10 @@ vec3 load_normal_and_prefetch_textures( dUVdBary = mat2(uvs[0]-uvs[2],uvs[1]-uvs[2]); const vec2 UV = dUVdBary*compactBary+uvs[2]; + // flip the tangent frame if mesh got flipped to undo Left Handed tangent frame + if (!bool(batchInstanceData.determinantSignBit&0x80000000u)) + dUVdBary = -dUVdBary; + // the direction/winding of the UV-space parallelogram doesn't matter for texture filtering const mat2 dUVdScreen = nbl_glsl_applyChainRule2D(dUVdBary,dBarydScreen); nbl_glsl_MC_runTexPrefetchStream(tps,UV,dUVdScreen*pc.cummon.textureFootprintFactor); } @@ -321,7 +352,7 @@ mat2x3 rand6d(in uvec3 scramble_keys[2], in int _sample, int depth) // decrement depth because first vertex is rasterized and picked with a different sample sequence --depth; // - const int offset = int(_sample*staticViewData.sampleSequenceStride)+depth*SAMPLING_STRATEGY_COUNT; + const int offset = _sample*int(staticViewData.sampleSequenceStride_hideEnvmap&0x7fFFffFFu)+depth*SAMPLING_STRATEGY_COUNT; const nbl_glsl_sampling_quantized3D quant1 = texelFetch(quantizedSampleSequence, offset).xy; const nbl_glsl_sampling_quantized3D quant2 = texelFetch(quantizedSampleSequence, offset+1).xy; @@ -508,8 +539,7 @@ uint generate_next_rays( // the 1.03125f adjusts for the fact that the normal might be too short (inversesqrt precision) const float inversesqrt_precision = 1.03125f; - // TODO: investigate why we can't use `normalizedN` here - const vec3 ray_offset_vector = normalize(cross(dPdBary[0],dPdBary[1]))*inversesqrt_precision; + const vec3 ray_offset_vector = normalizedG*inversesqrt_precision; float origin_offset = nbl_glsl_numeric_limits_float_epsilon(120u); // I pulled the constants out of my @$$ origin_offset += 
dot(abs(ray_offset_vector),abs(origin))*nbl_glsl_numeric_limits_float_epsilon(128u); @@ -523,6 +553,7 @@ uint generate_next_rays( //const vec3 geomNormal = cross(dPdBary[0],dPdBary[1]); //float ray_offset = ?; //ray_offset = nbl_glsl_ieee754_next_ulp_away_from_zero(ray_offset); + const vec3 ray_offset = ray_offset_vector*origin_offset; const vec3 ray_origin[2] = {origin+ray_offset,origin-ray_offset}; uint offset = 0u; diff --git a/22.RaytracedAO/raytraceCommon.h b/22.RaytracedAO/raytraceCommon.h index a070b2a94..595fc7198 100644 --- a/22.RaytracedAO/raytraceCommon.h +++ b/22.RaytracedAO/raytraceCommon.h @@ -98,11 +98,13 @@ struct StaticViewData_t uint8_t maxPathDepth; uint8_t noRussianRouletteDepth; uint16_t samplesPerPixelPerDispatch; + uint32_t sampleSequenceStride : 31; + uint32_t hideEnvmap : 1; #else uint imageDimensions; uint maxPathDepth_noRussianRouletteDepth_samplesPerPixelPerDispatch; + uint sampleSequenceStride_hideEnvmap; #endif - uint sampleSequenceStride; // this is a very small number actually, probably 20 bits left to play with float envMapPDFNormalizationFactor; nbl_glsl_RWMC_CascadeParameters cascadeParams; }; diff --git a/22.RaytracedAO/resolve.comp b/22.RaytracedAO/resolve.comp index b46b0f725..33541d08a 100644 --- a/22.RaytracedAO/resolve.comp +++ b/22.RaytracedAO/resolve.comp @@ -13,9 +13,10 @@ layout(set = 0, binding = 0, row_major) uniform StaticViewData layout(set = 0, binding = 1) uniform usampler2DArray colorSamples; layout(set = 0, binding = 2) uniform sampler2DArray albedoSamples; layout(set = 0, binding = 3) uniform usampler2DArray normalSamples; -layout(set = 0, binding = 4, rgba16f) restrict uniform image2D framebuffer; -layout(set = 0, binding = 5, r32ui) restrict uniform uimage2D albedo; -layout(set = 0, binding = 6, rgba16f) restrict uniform image2D normals; +layout(set = 0, binding = 4) uniform sampler2DArray maskSamples; +layout(set = 0, binding = 5, rgba16f) restrict uniform image2D framebuffer; +layout(set = 0, binding = 6, r32ui) restrict uniform uimage2D albedo; +layout(set = 0, binding = 7, rgba16f) restrict uniform image2D normals; layout(push_constant, row_major) uniform PushConstants { @@ -60,6 +61,10 @@ vec3 nbl_glsl_RWMC_sampleCascadeTexel(ivec2 coord, in ivec2 offset, in uint casc return value/float(samplesPerPixelPerDispatch); } +bool isRWMCEnabled() +{ + return staticViewData.cascadeParams.penultimateCascadeIx!=uint(-2); +} void main() { @@ -68,22 +73,30 @@ void main() { samplesPerPixelPerDispatch = bitfieldExtract(staticViewData.maxPathDepth_noRussianRouletteDepth_samplesPerPixelPerDispatch,16,16); - vec3 acc = nbl_glsl_RWMC_reweight(pc.rwmcReweightingParams,pixelCoord); + vec3 acc; + if (isRWMCEnabled()) + acc = nbl_glsl_RWMC_reweight(pc.rwmcReweightingParams,pixelCoord); + else // its a pretty ok function, reusing it + acc = nbl_glsl_RWMC_sampleCascadeTexel(pixelCoord,ivec2(0,0),-1); + vec3 alb = texelFetch(albedoSamples,ivec3(pixelCoord,0),0).rgb; vec3 nml = nbl_glsl_decodeRGB10A2_SNORM(texelFetch(normalSamples,ivec3(pixelCoord,0),0).r).xyz; + float msk = texelFetch(maskSamples,ivec3(pixelCoord,0),0).r; for (uint i=1u; icreateGPUShader(core::make_smart_refctd_ptr(R"===( @@ -428,12 +409,12 @@ void main() #include "../ShaderCommon.glsl" layout(binding = 0, std430) restrict readonly buffer DenoisedImageInputBuffer { - f16vec3_packed inDenoisedBuffer[]; + uvec2 inDenoisedBuffer[]; }; #define _NBL_GLSL_EXT_FFT_INPUT_DESCRIPTOR_DEFINED_ layout(binding = 1, std430) restrict buffer NoisyImageInputBufferAndSpectrumOutputBuffer { - uint16_t 
data[]; + uvec2 data[]; } aliasedBuffer[2]; #define _NBL_GLSL_EXT_FFT_OUTPUT_DESCRIPTOR_DEFINED_ @@ -466,12 +447,7 @@ uint nbl_glsl_ext_FFT_Parameters_t_getDirection() void nbl_glsl_ext_FFT_setData(in uvec3 coordinate, in uint channel, in nbl_glsl_complex complex_value) { const uint index = ((channel<>16); - aliasedBuffer[1].data[index*4+2] = uint16_t(asUint.y&0xffffu); - aliasedBuffer[1].data[index*4+3] = uint16_t(asUint.y>>16); + aliasedBuffer[1].data[index] = floatBitsToUint(complex_value); } #define _NBL_GLSL_EXT_FFT_SET_DATA_DEFINED_ @@ -480,7 +456,7 @@ void nbl_glsl_ext_FFT_setData(in uvec3 coordinate, in uint channel, in nbl_glsl_ #include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp" -vec3 preloadedPixels[(_NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_-1u)/_NBL_GLSL_WORKGROUP_SIZE_+1u]; +vec4 preloadedPixels[(_NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_-1u)/_NBL_GLSL_WORKGROUP_SIZE_+1u]; void main() { @@ -502,21 +478,25 @@ void main() ivec3 coordinate = oldCoord; nbl_glsl_ext_FFT_wrap_coord(coordinate); // const uint index = coordinate.y*pc.data.imageWidth+coordinate.x; - const vec3 denoised = vec3(inDenoisedBuffer[index].x,inDenoisedBuffer[index].y,inDenoisedBuffer[index].z); - vec3 noisy; - for (uint c=0; c<3; c++) - noisy[c] = unpackHalf2x16(uint(aliasedBuffer[0].data[index*3+c]))[0]; + const uvec2 denoisedData = inDenoisedBuffer[index]; + const vec4 denoised = vec4(unpackHalf2x16(denoisedData[0]),unpackHalf2x16(denoisedData[1])); + vec4 noisy; + { + uvec2 noisyData = aliasedBuffer[0].data[index]; + noisy.rg = unpackHalf2x16(noisyData[0]); + noisy.ba = unpackHalf2x16(noisyData[1]); // error "warning C7050: "noisy.zw" might be used before being initialized" is wrong + } preloadedPixels[t] = mix(denoised,noisy,pc.data.denoiseBlendFactor); // const bool contributesToLuma = all(equal(coordinate,oldCoord)); - scaledLogLuma += nbl_glsl_ext_LumaMeter_local_process(contributesToLuma,preloadedPixels[t]); + scaledLogLuma += nbl_glsl_ext_LumaMeter_local_process(contributesToLuma,preloadedPixels[t].rgb); } nbl_glsl_ext_LumaMeter_setFirstPassOutput(nbl_glsl_ext_LumaMeter_workgroup_process(scaledLogLuma)); // prevent overlap between different usages of shared memory barrier(); // Virtual Threads Calculation - for(uint channel=0u; channel<3u; channel++) + for(uint channel=0u; channel<4u; channel++) { for (uint t=0u; tgetAsset("../../media/kernels/physical_flare_512.exr",lp); // TODO: make it a builtins? - for (size_t i=0; i < inputFilesAmount; i++) + for (size_t i=0; igetRegions(); + // no mip chain, etc. 
 			assert(regions.begin()+1u==regions.end());
 			const auto& region = regions.begin()[0];
+			// there is an explicit buffer row length
 			assert(region.bufferRowLength);
 			outParam.colorTexelSize = asset::getTexelOrBlockBytesize(colorCreationParams.format);
 		}
@@ -1028,6 +1020,8 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
 		{
 			auto kerDim = outParam.kernel->getCreationParameters().extent;
 			float kernelScale,minKernelScale;
+			// portrait vs landscape, get smallest dimension
+			// the kernelScale makes sure that resampled kernel resolution will match the image to be blurred scaled by `bloomRelativeScale`
 			if (extent.width1.f)
 				os::Printer::log(imageIDString + "Bloom Kernel loose sharpness, increase resolution of bloom kernel or reduce its relative scale!", ELL_WARNING);
+			// kernel cannot be smaller than 2x2
 			else if (kernelScale
auto
 		{
 			auto tmp = extent;
@@ -1058,14 +1054,16 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
 			}
 			return tmp;
 		}();
+		// we abuse the same buffer as temporary storage for the Kernel FFT (two spans needed)
 		fftScratchSize = core::max(FFTClass::getOutputBufferSize(usingHalfFloatFFTStorage,outParam.scaledKernelExtent,colorChannelsFFT)*2u,fftScratchSize);
-		fftScratchSize = core::max(FFTClass::getOutputBufferSize(usingHalfFloatFFTStorage,marginSrcDim,colorChannelsFFT),fftScratchSize);
+		// and for the main image FFT (alpha included)
+		fftScratchSize = core::max(FFTClass::getOutputBufferSize(usingHalfFloatFFTStorage,marginSrcDim,allChannelsFFT),fftScratchSize);
 
 		// TODO: maybe move them to nested loop and compute JIT
 		{
 			auto* fftPushConstants = outParam.fftPushConstants;
 			auto* fftDispatchInfo = outParam.fftDispatchInfo;
 			const ISampler::E_TEXTURE_CLAMP fftPadding[2] = {ISampler::ETC_MIRROR,ISampler::ETC_MIRROR};
-			const auto passes = FFTClass::buildParameters(false,colorChannelsFFT,extent,fftPushConstants,fftDispatchInfo,fftPadding,marginSrcDim);
+			const auto passes = FFTClass::buildParameters(false,allChannelsFFT,extent,fftPushConstants,fftDispatchInfo,fftPadding,marginSrcDim);
 			{
 				// override for less work and storage (dont need to store the extra padding of the last axis after iFFT)
 				fftPushConstants[1].output_strides.x = fftPushConstants[0].input_strides.x;
@@ -1081,6 +1079,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
 				}
 				fftDispatchInfo[2] = fftDispatchInfo[0];
 			}
+			// only a 2D FFT
 			assert(passes==2);
 		}
 
@@ -1103,6 +1102,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
 				{
 					os::Printer::log(imageIDString + "Image extent of the Albedo Channel does not match the Color Channel, Albedo Channel will not be used!", ELL_ERROR);
 					albedoImage = nullptr;
+					continue;
 				}
 				else
 					outParam.denoiserType = EII_ALBEDO;
@@ -1144,7 +1144,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
 		size_t denoiserStateBufferSize = 0ull;
 		{
 			size_t scratchBufferSize = fftScratchSize;
-			size_t tempBufferSize = fftScratchSize;
+			size_t tempBufferSize = forcedOptiXFormatPixelCumExclSizes[EII_COUNT]*maxResolution[0]*maxResolution[1];
 			for (uint32_t i=0u; i
getCreationParameters();
-				assert(asset::getTexelOrBlockBytesize(creationParameters.format)==param.colorTexelSize);
 				// set up some image pitch and offset info
 				shaderConstants.inImageTexelPitch[j] = image->getRegions().begin()[0].bufferRowLength;
 				inImageByteOffset[j] = offsetPair->getOffset();
@@ -1460,8 +1459,8 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
 			// always need at least two input noisy buffers due to having to keep noisy colour around
 			for (uint32_t j=0u; j
tileAndInvoke(
@@ -1618,6 +1617,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
 			// image view
 			core::smart_refctd_ptr imageView;
+			// size needed to download denoised, bloomed and tonemapped image
 			const uint32_t colorBufferBytesize = param.height*param.width*param.colorTexelSize;
 			{
 				// create image
@@ -1783,7 +1783,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
 
 			// convert to EF_R8G8B8_SRGB and save it as .png and .jpg
 			{
-				auto newImageView = getConvertedImageView(imageView->getCreationParameters().image, EF_R8G8B8_SRGB);
+				auto newImageView = getConvertedImageView(imageView->getCreationParameters().image, EF_R8G8B8A8_SRGB);
 				IAssetWriter::SAssetWriteParams wp(newImageView.get());
 				std::string fileName = outputFileBundle[i].value().c_str();
diff --git a/50.IESProfileTest/CMakeLists.txt b/50.IESProfileTest/CMakeLists.txt
index 71c002ad5..52e8e83f2 100644
--- a/50.IESProfileTest/CMakeLists.txt
+++ b/50.IESProfileTest/CMakeLists.txt
@@ -4,4 +4,7 @@ if(NOT RES)
 	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
 endif()
 
-nbl_create_executable_project("" "" "" "")
\ No newline at end of file
+nbl_create_executable_project("" "" "" nlohmann_json::nlohmann_json)
+
+add_dependencies(${EXECUTABLE_NAME} nlohmann_json::nlohmann_json)
+target_include_directories(${EXECUTABLE_NAME} PUBLIC $)
\ No newline at end of file
diff --git a/50.IESProfileTest/compute/cdc.comp b/50.IESProfileTest/compute/cdc.comp
index 037d01a36..390d63acb 100644
--- a/50.IESProfileTest/compute/cdc.comp
+++ b/50.IESProfileTest/compute/cdc.comp
@@ -224,11 +224,11 @@ bool isWithinSCDomain(vec2 point)
 
 void main()
 {
-	const float VERTICAL_INVERSE = 1.0f / TEXTURE_SIZE;
-	const float HORIZONTAL_INVERSE = 1.0f / TEXTURE_SIZE;
-
-	const ivec2 pixelCoordinates = ivec2(gl_GlobalInvocationID.xy);
 	const ivec2 destinationSize = imageSize(outIESCandelaImage);
+	const ivec2 pixelCoordinates = ivec2(gl_GlobalInvocationID.xy);
+
+	const float VERTICAL_INVERSE = 1.0f / float(destinationSize.x);
+	const float HORIZONTAL_INVERSE = 1.0f / float(destinationSize.y);
 
 	if (all(lessThan(pixelCoordinates, destinationSize)))
 	{
diff --git a/50.IESProfileTest/compute/common.h b/50.IESProfileTest/compute/common.h
index 378625732..edbc94104 100644
--- a/50.IESProfileTest/compute/common.h
+++ b/50.IESProfileTest/compute/common.h
@@ -8,7 +8,6 @@
 #define M_HALF_PI M_PI/2.0f // would be cool if we have this define somewhere or GLSL do
 
 #define QUANT_ERROR_ADMISSIBLE 1/1024
-#define TEXTURE_SIZE 1024u
 #define WORKGROUP_SIZE 256u
 #define WORKGROUP_DIMENSION 16u
 
diff --git a/50.IESProfileTest/inputs.json b/50.IESProfileTest/inputs.json
new file mode 100644
index 000000000..d6b4ce528
--- /dev/null
+++ b/50.IESProfileTest/inputs.json
@@ -0,0 +1,14 @@
+{
+    "directories": [
+        "../media/mitsuba/ies/packages/leomoon-dot-com_ies-lights-pack/ies-lights-pack"
+    ],
+    "files": [
+        "../media/mitsuba/ies/ISOTROPIC/007cfb11e343e2f42e3b476be4ab684e.ies",
+        "../media/mitsuba/ies/ANIISOTROPIC/QUAD_SYMMETRY/0275171fb664c1b3f024d1e442a68d22.ies",
+        "../media/mitsuba/ies/ANIISOTROPIC/HALF_SYMMETRY/1392a1ba55b67d3e0ae7fd63527f3e78.ies",
+        "../media/mitsuba/ies/ANIISOTROPIC/OTHER_HALF_SYMMETRY/028e97564391140b1476695ae7a46fa4.ies",
+        "../media/mitsuba/ies/NO_LATERAL_SYMMET/4b88bf886b39cfa63094e70e1afa680e.ies"
+    ],
+    "gui": true,
+    "writeAssets": false
+}
\ No newline at end of file
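The `inputs.json` added above drives the reworked `50.IESProfileTest/main.cpp` that follows. As a rough, hedged illustration of consuming that schema with nlohmann_json (the file name and plain console output here are assumptions; the real example additionally resolves paths relative to the json, scans directories and validates `.ies` extensions):

```cpp
// Minimal sketch of reading the inputs.json schema above with nlohmann::json.
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "nlohmann/json.hpp"

int main()
{
    std::ifstream file("inputs.json"); // assumed working-directory-relative path
    if (!file.is_open())
        return 1; // the example itself printf()s an error and exit()s instead

    const nlohmann::json jsonMap = nlohmann::json::parse(file);

    // "directories" are scanned for *.ies files, "files" are taken as given
    const auto directories = jsonMap.value("directories", std::vector<std::string>{});
    const auto files = jsonMap.value("files", std::vector<std::string>{});
    const bool gui = jsonMap.value("gui", true);
    const bool writeAssets = jsonMap.value("writeAssets", false);

    std::cout << directories.size() << " directories, " << files.size() << " files, gui=" << gui
              << ", writeAssets=" << writeAssets << "\n";
    return 0;
}
```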
diff --git a/50.IESProfileTest/main.cpp b/50.IESProfileTest/main.cpp
index 79106697b..7aa640f67 100644
--- a/50.IESProfileTest/main.cpp
+++ b/50.IESProfileTest/main.cpp
@@ -2,14 +2,37 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
+#define BENCHMARK_TILL_FIRST_FRAME
+
 #include
 #include
 #include
 #include "nbl/ext/ScreenShot/ScreenShot.h"
 #include "compute/common.h"
+#include
+
+// small hack to compile with the json library
+namespace std
+{
+	int sprintf_s(char* buffer, size_t size, const char* format, ...) {
+		va_list args;
+		va_start(args, format);
+		int result = ::vsprintf_s(buffer, size, format, args);
+		va_end(args);
+		return result;
+	}
+}
+
+#include "nlohmann/json.hpp"
 
 using namespace nbl;
 using namespace core;
+using json = nlohmann::json;
+
+#ifdef BENCHMARK_TILL_FIRST_FRAME
+const std::chrono::steady_clock::time_point startBenchmark = std::chrono::high_resolution_clock::now();
+bool stopBenchmarkFlag = false;
+#endif
 
 class IESCompute
 {
@@ -71,7 +94,7 @@ class IESCompute
 		driver->bindDescriptorSets(EPBP_COMPUTE, gpue.cPipeline->getLayout(), 0u, 1u, &gpue.cDescriptorSet.get(), nullptr);
 		driver->pushConstants(gpue.cPipeline->getLayout(), asset::ISpecializedShader::ESS_COMPUTE, 0u, sizeof(PushConstant), &pushConstant);
 
-		_NBL_STATIC_INLINE_CONSTEXPR auto xGroups = (TEXTURE_SIZE - 1u) / WORKGROUP_DIMENSION + 1u;
+		const auto xGroups = (getActiveProfile().getOptimalIESResolution().x - 1u) / WORKGROUP_DIMENSION + 1u;
 		driver->dispatch(xGroups, xGroups, 1u);
 		COpenGLExtensionHandler::extGlMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT | GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
 
@@ -91,6 +114,16 @@ class IESCompute
 		{
 			driver->blitRenderTargets(fbo, nullptr, false, false);
 			driver->endScene();
+
+			#ifdef BENCHMARK_TILL_FIRST_FRAME
+			if (!stopBenchmarkFlag)
+			{
+				const std::chrono::steady_clock::time_point stopBenchmark = std::chrono::high_resolution_clock::now();
+				auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stopBenchmark - startBenchmark);
+				std::cout << "Time taken till first render pass: " << duration.count() << " milliseconds" << std::endl;
+				stopBenchmarkFlag = true;
+			}
+			#endif
 		}
 
 		void updateZDegree(const asset::CIESProfile::IES_STORAGE_FORMAT& degreeOffset)
@@ -109,8 +142,11 @@ class IESCompute
 			generalPurposeOffset = newOffset;
 
 			// not elegant way to do it here but lets leave it as it is
-			updateCDescriptorSets();
-			pushConstant.maxIValueReciprocal = (float)getActiveProfile().getMaxCandelaValue();
+			updateCDescriptorSets(); // flush descriptor set
+			updateGDescriptorSets(); // flush descriptor set
+
+			const auto& profile = getActiveProfile();
+			pushConstant.maxIValue = (float)profile.getMaxCandelaValue();
 		}
 	}
@@ -143,18 +179,12 @@ class IESCompute
 		};
 
 		auto& gpue = m_gpue;
-
-		gpue.dImageIESC = std::move(createGPUImageView(TEXTURE_SIZE, TEXTURE_SIZE));
-		gpue.dImageS = std::move(createGPUImageView(TEXTURE_SIZE, TEXTURE_SIZE));
-		gpue.dImageD = std::move(createGPUImageView(TEXTURE_SIZE, TEXTURE_SIZE));
-		gpue.dImageTMask = std::move(createGPUImageView(TEXTURE_SIZE, TEXTURE_SIZE));
-
-		createSSBOBuffers();
+		createGPUDescriptors();
+		const auto initIdx = generalPurposeOffset;
 
 		// Compute
 		{
 			const std::vector bindings = getCBindings();
-
 			{
 				auto descriptorSetLayout = driver->createGPUDescriptorSetLayout(bindings.data(), bindings.data() + bindings.size());
 				asset::SPushConstantRange range = { asset::ISpecializedShader::ESS_COMPUTE, 0u, sizeof(PushConstant) };
@@ -164,22 +194,6 @@ class IESCompute
 			}
 
 			{
-				{
-					{
-						gpue.cinfos[EB_IMAGE_IES_C].desc = core::smart_refctd_ptr(gpue.dImageIESC);
-						gpue.cinfos[EB_IMAGE_IES_C].image = { nullptr, asset::EIL_GENERAL };
-
-						gpue.cinfos[EB_IMAGE_S].desc = core::smart_refctd_ptr(gpue.dImageS);
-						gpue.cinfos[EB_IMAGE_S].image = { nullptr, asset::EIL_GENERAL };
-
-						gpue.cinfos[EB_IMAGE_D].desc = core::smart_refctd_ptr(gpue.dImageD);
-						gpue.cinfos[EB_IMAGE_D].image = { nullptr, asset::EIL_GENERAL };
-
-						gpue.cinfos[EB_IMAGE_T_MASK].desc = core::smart_refctd_ptr(gpue.dImageTMask);
-						gpue.cinfos[EB_IMAGE_T_MASK].image = { nullptr, asset::EIL_GENERAL };
-					}
-				}
-
 				for (auto i = 0; i < EB_SIZE; i++)
 				{
 					gpue.cwrites[i].dstSet = gpue.cDescriptorSet.get();
@@ -203,14 +217,7 @@ class IESCompute
 
 		// Graphics
 		{
-			const std::vector bindings =
-			{
-				{ EB_IMAGE_IES_C, asset::EDT_COMBINED_IMAGE_SAMPLER, 1, asset::ISpecializedShader::ESS_FRAGMENT, nullptr },
-				{ EB_IMAGE_S, asset::EDT_COMBINED_IMAGE_SAMPLER, 1, asset::ISpecializedShader::ESS_FRAGMENT, nullptr },
-				{ EB_IMAGE_D, asset::EDT_COMBINED_IMAGE_SAMPLER, 1, asset::ISpecializedShader::ESS_FRAGMENT, nullptr },
-				{ EB_IMAGE_T_MASK, asset::EDT_COMBINED_IMAGE_SAMPLER, 1, asset::ISpecializedShader::ESS_FRAGMENT, nullptr }
-			};
-
+			const std::vector bindings = getGBindings();
 			{
 				auto descriptorSetLayout = driver->createGPUDescriptorSetLayout(bindings.data(), bindings.data() + bindings.size());
@@ -238,39 +245,23 @@ class IESCompute
 			return driver->createGPUSampler({ asset::ISampler::ETC_CLAMP_TO_EDGE,asset::ISampler::ETC_CLAMP_TO_EDGE,asset::ISampler::ETC_CLAMP_TO_EDGE,asset::ISampler::ETBC_FLOAT_OPAQUE_BLACK,asset::ISampler::ETF_LINEAR,asset::ISampler::ETF_LINEAR,asset::ISampler::ESMM_LINEAR,0u,false,asset::ECO_ALWAYS });
 		};
 
-		_NBL_STATIC_INLINE_CONSTEXPR uint8_t NBL_D_IMAGES_AMOUNT = 4u;
+		gpue.sampler = createSampler();
 
-		IGPUDescriptorSet::SDescriptorInfo infos[NBL_D_IMAGES_AMOUNT];
+		for (auto i = 0; i < gpue.NBL_D_IMAGES_AMOUNT; i++)
 		{
-			infos[EB_IMAGE_IES_C].desc = core::smart_refctd_ptr(gpue.dImageIESC);
-			infos[EB_IMAGE_IES_C].image = { createSampler(),asset::EIL_SHADER_READ_ONLY_OPTIMAL};
-
-			infos[EB_IMAGE_S].desc = core::smart_refctd_ptr(gpue.dImageS);
-			infos[EB_IMAGE_S].image = { createSampler(),asset::EIL_SHADER_READ_ONLY_OPTIMAL };
-
-			infos[EB_IMAGE_D].desc = core::smart_refctd_ptr(gpue.dImageD);
-			infos[EB_IMAGE_D].image = { createSampler(),asset::EIL_SHADER_READ_ONLY_OPTIMAL };
-
-			infos[EB_IMAGE_T_MASK].desc = core::smart_refctd_ptr(gpue.dImageTMask);
-			infos[EB_IMAGE_T_MASK].image = { createSampler(),asset::EIL_SHADER_READ_ONLY_OPTIMAL };
+			gpue.gwrites[i].dstSet = gpue.gDescriptorSet.get();
+			gpue.gwrites[i].binding = i;
+			gpue.gwrites[i].count = 1u;
+			gpue.gwrites[i].arrayElement = 0u;
+			gpue.gwrites[i].descriptorType = asset::EDT_COMBINED_IMAGE_SAMPLER;
+			gpue.gwrites[i].info = gpue.ginfos + i;
 		}
 
-		video::IGPUDescriptorSet::SWriteDescriptorSet writes[NBL_D_IMAGES_AMOUNT];
-		for (auto i = 0; i < NBL_D_IMAGES_AMOUNT; i++)
-		{
-			writes[i].dstSet = gpue.gDescriptorSet.get();
-			writes[i].binding = i;
-			writes[i].count = 1u;
-			writes[i].arrayElement = 0u;
-			writes[i].descriptorType = asset::EDT_COMBINED_IMAGE_SAMPLER;
-			writes[i].info = &infos[i];
-		}
-
-		driver->updateDescriptorSets(NBL_D_IMAGES_AMOUNT, writes, 0u, nullptr);
+		updateGDescriptorSets();
 		}
 	}
 
-	void createSSBOBuffers()
+	void createGPUDescriptors()
 	{
 		auto createCPUBuffer = [&](const auto& pInput)
 		{
@@ -293,22 +284,27 @@ class IESCompute
 		cssbod.hAngles = createGPUBuffer(createCPUBuffer(profile.getHoriAngles()));
 		cssbod.vAngles = createGPUBuffer(createCPUBuffer(profile.getVertAngles()));
 		cssbod.data = createGPUBuffer(createCPUBuffer(profile.getData()));
+
+		const auto optimalResolution = profile.getOptimalIESResolution();
+
+		cssbod.dImageIESC = std::move(createGPUImageView(optimalResolution.x, optimalResolution.y));
+		cssbod.dImageS = std::move(createGPUImageView(optimalResolution.x, optimalResolution.y));
+		cssbod.dImageD = std::move(createGPUImageView(optimalResolution.x, optimalResolution.y));
+		cssbod.dImageTMask = std::move(createGPUImageView(optimalResolution.x, optimalResolution.y));
 		}
 	}
 
 	void updateCDescriptorSets()
 	{
+		fillImageDescriptorInfo(generalPurposeOffset, m_gpue.cinfos[EB_IMAGE_IES_C]);
+		fillImageDescriptorInfo(generalPurposeOffset, m_gpue.cinfos[EB_IMAGE_S]);
+		fillImageDescriptorInfo(generalPurposeOffset, m_gpue.cinfos[EB_IMAGE_D]);
+		fillImageDescriptorInfo(generalPurposeOffset, m_gpue.cinfos[EB_IMAGE_T_MASK]);
+
 		fillSSBODescriptorInfo(generalPurposeOffset, m_gpue.cinfos[EB_SSBO_HA]);
 		fillSSBODescriptorInfo(generalPurposeOffset, m_gpue.cinfos[EB_SSBO_VA]);
 		fillSSBODescriptorInfo(generalPurposeOffset, m_gpue.cinfos[EB_SSBO_D]);
 
-		const std::vector bindings = getCBindings();
-		{
-			auto descriptorSetLayout = driver->createGPUDescriptorSetLayout(bindings.data(), bindings.data() + bindings.size());
-			asset::SPushConstantRange range = { asset::ISpecializedShader::ESS_COMPUTE, 0u, sizeof(PushConstant) };
-			m_gpue.cDescriptorSet = driver->createGPUDescriptorSet(std::move(descriptorSetLayout)); // I guess it can be done better
-		}
-
 		const core::smart_refctd_ptr proxy(m_gpue.cPipeline->getLayout()->getDescriptorSetLayout(0));
 		m_gpue.cDescriptorSet = core::smart_refctd_ptr(driver->createGPUDescriptorSet(core::smart_refctd_ptr(proxy)));
@@ -318,6 +314,22 @@ class IESCompute
 		driver->updateDescriptorSets(EB_SIZE, m_gpue.cwrites, 0u, nullptr);
 	}
 
+	void updateGDescriptorSets()
+	{
+		fillImageDescriptorInfo(generalPurposeOffset, m_gpue.ginfos[EB_IMAGE_IES_C]);
+		fillImageDescriptorInfo(generalPurposeOffset, m_gpue.ginfos[EB_IMAGE_S]);
+		fillImageDescriptorInfo(generalPurposeOffset, m_gpue.ginfos[EB_IMAGE_D]);
+		fillImageDescriptorInfo(generalPurposeOffset, m_gpue.ginfos[EB_IMAGE_T_MASK]);
+
+		const core::smart_refctd_ptr proxy(m_gpue.gPipeline->getLayout()->getDescriptorSetLayout(3));
+		m_gpue.gDescriptorSet = core::smart_refctd_ptr(driver->createGPUDescriptorSet(core::smart_refctd_ptr(proxy)));
+
+		for (auto i = 0; i < m_gpue.NBL_D_IMAGES_AMOUNT; i++)
+			m_gpue.gwrites[i].dstSet = m_gpue.gDescriptorSet.get();
+
+		driver->updateDescriptorSets(m_gpue.NBL_D_IMAGES_AMOUNT, m_gpue.gwrites, 0u, nullptr);
+	}
+
 	template
 	void fillSSBODescriptorInfo(const size_t assetIndex, IGPUDescriptorSet::SDescriptorInfo& info)
 	{
@@ -339,6 +351,29 @@ class IESCompute
 		info.buffer = { 0, proxy->getSize() };
 	}
 
+	template
+	void fillImageDescriptorInfo(const size_t assetIndex, IGPUDescriptorSet::SDescriptorInfo& info)
+	{
+		static_assert(binding == EB_IMAGE_IES_C || binding == EB_IMAGE_S || binding == EB_IMAGE_D || binding == EB_IMAGE_T_MASK);
+
+		const auto& profile = getProfile(assetIndex);
+		auto& cssbod = m_gpue.CSSBOD[assetIndex];
+
+		core::smart_refctd_ptr proxy;
+
+		if constexpr (binding == EB_IMAGE_IES_C)
+			proxy = core::smart_refctd_ptr(cssbod.dImageIESC);
+		else if (binding == EB_IMAGE_S)
+			proxy = core::smart_refctd_ptr(cssbod.dImageS);
+		else if (binding == EB_IMAGE_D)
+			proxy = core::smart_refctd_ptr(cssbod.dImageD);
+		else
+			proxy = core::smart_refctd_ptr(cssbod.dImageTMask);
+
+		info.desc = core::smart_refctd_ptr(proxy);
+		info.image = { core::smart_refctd_ptr(m_gpue.sampler), asset::EIL_SHADER_READ_ONLY_OPTIMAL };
+	}
+
 	template
 	auto createGPUImageView(const size_t& width, const size_t& height)
 	{
@@ -385,6 +420,19 @@ class IESCompute
 		return bindings;
 	}
 
+	std::vector getGBindings()
+	{
+		const std::vector bindings =
+		{
+			{ EB_IMAGE_IES_C, asset::EDT_COMBINED_IMAGE_SAMPLER, 1, asset::ISpecializedShader::ESS_FRAGMENT, nullptr },
+			{ EB_IMAGE_S, asset::EDT_COMBINED_IMAGE_SAMPLER, 1, asset::ISpecializedShader::ESS_FRAGMENT, nullptr },
+			{ EB_IMAGE_D, asset::EDT_COMBINED_IMAGE_SAMPLER, 1, asset::ISpecializedShader::ESS_FRAGMENT, nullptr },
+			{ EB_IMAGE_T_MASK, asset::EDT_COMBINED_IMAGE_SAMPLER, 1, asset::ISpecializedShader::ESS_FRAGMENT, nullptr }
+		};
+
+		return bindings;
+	}
+
 	template
 	video::IFrameBuffer* createFBO(const size_t& width, const size_t& height)
 	{
@@ -401,6 +449,8 @@ class IESCompute
 
 	struct GPUE
 	{
+		_NBL_STATIC_INLINE_CONSTEXPR uint8_t NBL_D_IMAGES_AMOUNT = 4u;
+
 		// Compute
 		core::smart_refctd_ptr cPipeline;
 		core::smart_refctd_ptr cDescriptorSet;
@@ -411,6 +461,7 @@ class IESCompute
 		struct CSSBODescriptor
 		{
 			core::smart_refctd_ptr vAngles, hAngles, data;
+			core::smart_refctd_ptr dImageIESC, dImageS, dImageD, dImageTMask;
 		};
 
 		std::vector CSSBOD;
@@ -420,20 +471,19 @@ class IESCompute
 		core::smart_refctd_ptr gDescriptorSet;
 		core::smart_refctd_ptr mBuffer;
 
+		IGPUDescriptorSet::SDescriptorInfo ginfos[NBL_D_IMAGES_AMOUNT];
+		IGPUDescriptorSet::SWriteDescriptorSet gwrites[NBL_D_IMAGES_AMOUNT];
+
 		// Shared data
-		core::smart_refctd_ptr dImageIESC;
-		core::smart_refctd_ptr dImageS;
-		core::smart_refctd_ptr dImageD;
-		core::smart_refctd_ptr dImageTMask;
+		core::smart_refctd_ptr sampler;
 	} m_gpue;
 
#include "nbl/nblpack.h"
 	struct PushConstant
 	{
-		float maxIValueReciprocal;
+		float maxIValue;
 		float zAngleDegreeRotation;
 		IESCompute::E_MODE mode = IESCompute::EM_CDC;
-		uint32_t dummy;
 	} PACK_STRUCT;
#include "nbl/nblunpack.h"
@@ -544,38 +594,149 @@ int main()
 	asset::IAssetLoader::SAssetLoadParams lparams;
 	lparams.loaderFlags;
-
-	constexpr auto IES_INPUTS = std::array
-	{
-		std::string_view("../../media/mitsuba/ies/ISOTROPIC/007cfb11e343e2f42e3b476be4ab684e.ies"),
-		std::string_view("../../media/mitsuba/ies/ANIISOTROPIC/QUAD_SYMMETRY/0275171fb664c1b3f024d1e442a68d22.ies"),
-		std::string_view("../../media/mitsuba/ies/ANIISOTROPIC/HALF_SYMMETRY/1392a1ba55b67d3e0ae7fd63527f3e78.ies"),
-		std::string_view("../../media/mitsuba/ies/ANIISOTROPIC/OTHER_HALF_SYMMETRY/028e97564391140b1476695ae7a46fa4.ies"),
-		std::string_view("../../media/mitsuba/ies/NO_LATERAL_SYMMET/4b88bf886b39cfa63094e70e1afa680e.ies"),
+
+	auto readJSON = [](const std::string& filePath)
+	{
+		std::ifstream file(filePath.data());
+		if (!file.is_open()) {
+			printf("Invalid input json \"%s\" file! Aborting..", filePath.data());
+			exit(0x45);
+		}
+
+		std::stringstream buffer;
+		buffer << file.rdbuf();
+
+		return buffer.str();
 	};
 
+	const auto INPUT_JSON_FILE_PATH_FS = std::filesystem::absolute("../inputs.json");
+	const auto INPUT_JSON_FILE_PATH = INPUT_JSON_FILE_PATH_FS.string();
+	const auto jsonBuffer = readJSON(INPUT_JSON_FILE_PATH);
+	if (jsonBuffer.empty()) {
+		printf("Read input json \"%s\" file is empty! Aborting..\n", INPUT_JSON_FILE_PATH.c_str());
+		exit(0x45);
+	}
+
+	const auto jsonMap = json::parse(jsonBuffer.c_str());
+
+	if (!jsonMap["directories"].is_array())
+	{
+		printf("Input json \"%s\" file's field \"directories\" is not an array! Aborting..\n", INPUT_JSON_FILE_PATH.c_str());
+		exit(0x45);
+	}
+
+	if (!jsonMap["files"].is_array())
+	{
+		printf("Input json \"%s\" file's field \"files\" is not an array! Aborting..\n", INPUT_JSON_FILE_PATH.c_str());
+		exit(0x45);
+	}
+
+	if (!jsonMap["writeAssets"].is_boolean())
+	{
+		printf("Input json \"%s\" file's field \"writeAssets\" is not a boolean! Aborting..\n", INPUT_JSON_FILE_PATH.c_str());
+		exit(0x45);
+	}
+
+	const auto&& IES_INPUTS = [&]()
+	{
+		std::vector inputFilePaths;
+
+		auto addFile = [&inputFilePaths, &INPUT_JSON_FILE_PATH_FS](const std::string_view filePath) -> void
+		{
+			auto path = std::filesystem::path(filePath);
+
+			if (!path.is_absolute())
+				path = std::filesystem::absolute(INPUT_JSON_FILE_PATH_FS.parent_path() / path);
+
+			if (std::filesystem::exists(path) && std::filesystem::is_regular_file(path) && path.extension() == ".ies")
+				inputFilePaths.push_back(path.string());
+			else
+			{
+				printf("Invalid input path \"%s\"! Aborting..\n", path.string().c_str());
+				exit(0x45);
+			}
+		};
+
+		auto addFiles = [&inputFilePaths, &INPUT_JSON_FILE_PATH_FS, &addFile](const std::string_view directoryPath) -> void
+		{
+			auto directory(std::filesystem::absolute(INPUT_JSON_FILE_PATH_FS.parent_path() / directoryPath));
+			if (!std::filesystem::exists(directory) || !std::filesystem::is_directory(directory)) {
+				printf("Invalid input directory \"%s\"! Aborting..\n", directoryPath.data());
+				exit(0x45);
+			}
+
+			for (const auto& entry : std::filesystem::directory_iterator(directory))
+				addFile(entry.path().string().c_str());
+		};
+
+		// parse json
+		{
+			std::vector jDirectories;
+			jsonMap["directories"].get_to(jDirectories);
+
+			for (const auto& it : jDirectories)
+				addFiles(it);
+
+			std::vector jFiles;
+			jsonMap["files"].get_to(jFiles);
+
+			for (const auto& it : jFiles)
+				addFile(it);
+		}
+
+		return std::move(inputFilePaths);
+	}();
+
+	const bool GUI = [&]()
+	{
+		bool b = false;
+		jsonMap["gui"].get_to(b);
+
+		return b;
+	}();
+
+	const bool WRITE_ASSETS = [&]()
+	{
+		bool b = false;
+		jsonMap["writeAssets"].get_to(b);
+
+		return b;
+	}();
+
 	const auto ASSETS = [&]()
 	{
+		size_t loaded = {}, total = IES_INPUTS.size();
 		std::vector assets;
 		std::vector outStems;
 
-		for (size_t i = 0; i < IES_INPUTS.size(); ++i)
+		for (size_t i = 0; i < total; ++i)
 		{
-			auto asset = device->getAssetManager()->getAsset(IES_INPUTS[i].data(), lparams);
-			const auto stem = std::filesystem::path(IES_INPUTS[i].data()).stem().string();
+			auto asset = device->getAssetManager()->getAsset(IES_INPUTS[i].c_str(), lparams);
+			const auto* path = IES_INPUTS[i].c_str();
+			const auto stem = std::filesystem::path(IES_INPUTS[i].c_str()).stem().string();
 
 			if (asset.getMetadata())
 			{
 				assets.emplace_back(std::move(asset));
 				outStems.push_back(stem);
+				++loaded;
 			}
 			else
-				printf("Could not load metadata from \"%s\" asset! Skipping..", stem.c_str());
+				printf("Could not load metadata from \"%s\" asset! Skipping..\n", path);
 		}
 
+		printf("Loaded [%s/%s] assets! Status: %s\n", std::to_string(loaded).c_str(), std::to_string(total).c_str(), loaded == total ? "PASSING" : "FAILING");
 		return std::make_pair(assets, outStems);
 	}();
 
+	if (GUI)
+		printf("GUI Mode: ON\n");
+	else
+	{
+		printf("GUI Mode: OFF\nExiting...");
+		exit(0);
+	}
+
 	IESCompute iesComputeEnvironment(driver, am, ASSETS.first);
 	IESExampleEventReceiver receiver;
 	device->setEventReceiver(&receiver);
@@ -640,22 +801,23 @@ int main()
 		receiver.reset();
 	}
 
-	for (size_t i = 0; i < ASSETS.first.size(); ++i)
-	{
-		const auto& bundle = ASSETS.first[i];
-		const auto& stem = ASSETS.second[i];
+	if(WRITE_ASSETS)
+		for (size_t i = 0; i < ASSETS.first.size(); ++i)
+		{
+			const auto& bundle = ASSETS.first[i];
+			const auto& stem = ASSETS.second[i];
 
-		const auto& profile = bundle.getMetadata()->selfCast()->profile;
-		// const std::string out = std::filesystem::absolute("out/cpu/" + std::string(getProfileRS(profile)) + "/" + stem + ".png").string(); TODO (?): why its not working?
-		const std::string out = std::filesystem::absolute(std::string(getProfileRS(profile)) + "_" + stem + ".png").string();
+			const auto& profile = bundle.getMetadata()->selfCast()->profile;
+			// const std::string out = std::filesystem::absolute("out/cpu/" + std::string(getProfileRS(profile)) + "/" + stem + ".png").string(); TODO (?): why its not working? ah touch required probably first
+			const std::string out = std::filesystem::absolute(std::string(getProfileRS(profile)) + "_" + stem + ".png").string();
 
-		asset::IAssetWriter::SAssetWriteParams wparams(bundle.getContents().begin()->get());
+			asset::IAssetWriter::SAssetWriteParams wparams(bundle.getContents().begin()->get());
 
-		if (am->writeAsset(out.c_str(), wparams))
-			printf("Saved \"%s\"\n", out.c_str());
-		else
-			printf("Could not write \"%s\"\n", out.c_str());
-	}
+			if (am->writeAsset(out.c_str(), wparams))
+				printf("Saved \"%s\"\n", out.c_str());
+			else
+				printf("Could not write \"%s\"\n", out.c_str());
+		}
 
 	return 0;
 }
\ No newline at end of file
diff --git a/50.IESProfileTest/test.ies b/50.IESProfileTest/test.ies
deleted file mode 100644
index 8e00804c3..000000000
--- a/50.IESProfileTest/test.ies
+++ /dev/null
@@ -1,30 +0,0 @@
-IESNA:LM-63-1995
-[TEST]
-[TESTLAB] BEGA
-[MANUFAC] BEGA
-[MORE] Copyright LUMCat V
-[LUMCAT]
-[LUMINAIRE] 84483K3 (Preliminary)
-[ISSUEDATE] 2020-07-22
-[LAMPCAT] LED 24W
-[LAMP] 2500 lm,27 W
-TILT=NONE
-1 -1 1.0 73 1 1 2 -0.485 0.000 0.130
-1.0 1.0 27
- 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 25.0 27.5 30.0
- 32.5 35.0 37.5 40.0 42.5 45.0 47.5 50.0 52.5 55.0 57.5 60.0 62.5
- 65.0 67.5 70.0 72.5 75.0 77.5 80.0 82.5 85.0 87.5 90.0 92.5 95.0
- 97.5 100.0 102.5 105.0 107.5 110.0 112.5 115.0 117.5 120.0 122.5 125.0 127.5
- 130.0 132.5 135.0 137.5 140.0 142.5 145.0 147.5 150.0 152.5 155.0 157.5 160.0
- 162.5 165.0 167.5 170.0 172.5 175.0 177.5 180.0
- 0.0
- 688.3 686.8 684.0 680.3 675.3 668.8 660.9 650.7
- 638.6 624.9 609.6 593.0 575.2 556.3 536.5 516.3
- 495.7 475.4 455.7 436.5 417.0 397.4 378.0 359.1
- 340.8 322.9 305.3 287.9 270.9 253.9 237.1 220.5
- 204.1 187.8 171.8 156.0 140.5 125.4 110.7 96.4
- 82.6 69.3 56.5 44.6 33.6 23.7 15.3 8.7
- 4.1 1.4 0.3 0.1 0.1 0.0 0.0 0.0
- 0.0 0.0 0.0 0.0 0.1 0.2 0.4 0.6
- 1.0 1.3 1.4 1.4 1.4 1.3 0.8 0.5
- 0.4
diff --git a/media b/media
index 6f5346ff8..ad2cb3a9a 160000
--- a/media
+++ b/media
@@ -1 +1 @@
-Subproject commit 6f5346ff8f20f0bedeaa9c58a715ab4d6fce661c
+Subproject commit ad2cb3a9a1655c5c4d0ffa1c515f710568f0487d