diff --git a/CHANGELOG.md b/CHANGELOG.md index 5421cdf526..0d5ad1fffe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- Add tile transfer annotation [#127](https://github.com/pulp-platform/Deeploy/pull/127) - Refactor Logging for Improved Debugging [#115](https://github.com/pulp-platform/Deeploy/pull/115) - Add reuse-tool as an SPDX license header linter [#113](https://github.com/pulp-platform/Deeploy/pull/113) - Bug fixes, API Cleanup and Reduce Compiler Warning on PULP [#112](https://github.com/pulp-platform/Deeploy/pull/112) @@ -46,6 +47,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Buffer utilities: `checkNumLevels` validation and `sizeInBytes` method - Per–memory-level usage tracking and worst-case reporting in `NetworkContext` - Memory/I/O summaries and input/output logging in deployers +- Added transfer annotation of tiled execution blocks ### Changed - Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`. @@ -73,6 +75,8 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Changed types and added correct casts to fix many compiler warnings in the PULP target library - Use [reuse-tool](https://github.com/fsfe/reuse-tool) in pre-commit, CI, and Makefile for SPDX license header linting - Deployer workflow now uses `prepare(...)` instead of `generateFunction(...)`. +- Refactored `computeTileHyperRectangles` +- `wrapTilingSolution` now uses the transfer annotation ### Fixed - Prevent node duplication for graphs generated via GraphSurgeon @@ -83,6 +87,7 @@ This file contains the changelog for the Deeploy project. 
The changelog is divid - Corrected method usage in `importDeeployState` to call `NetworkContext.importNetworkContext` instead of the incorrect method name - Correctly return `signProp` from `setupDeployer` instead of hardcoding the value to `False` in `testMVP.py` - Fixed `Unsqueeze` Op. when using ONNX opset 13 or higher (from attribute to input) +- Fixed compiler warning by casting the external pointer in L3Dma to uint32_t ### Removed - Delete outdated and unused `.gitlab-ci.yml` file diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py index b73fcafe31..e392e3355b 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py @@ -130,6 +130,8 @@ def apply(self, ctxt._dynamicSize[memoryLevel] += int(buffer.sizeInBytes()) executionBlock.addLeft(buffer.allocTemplate, buffer._bufferRepresentation()) + if isinstance(buffer, TransientBuffer): + executionBlock.addLeft(buffer.initTemplate, buffer._bufferRepresentation()) for levels in ctxt._dynamicSize.keys(): if levels not in ctxt._maxDynamicSize: diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index e6ca25c9bd..3783503931 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -1458,6 +1458,7 @@ def __init__(self, operatorCodeSnippet: Optional[CodeSnippet] = None): ) #: Sequence[CodeSnippet]: ordered list of code snippets that need to be generated to implemented the associated operator self.patternMemoryConstraint: Optional = None #: Optional[PatternMemoryConstraint]: Tiling information of the operator which is annotated in the midend + self.transfers: Optional = None #: Optional[Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]]: Tiling transfers def addLeft(self, template: NodeTemplate, operatorRepresentation: OperatorRepresentation): """Adds a code snippet that is generated BEFORE any 
of the other code snippets in this ExecutionBlock @@ -2892,7 +2893,7 @@ def generateInferenceInitializationCode(self) -> str: callStack = '' for node in self.ctxt.localObjects.values(): # WIESEP: We don't want to initialize the struct buffers as this should be handled by the ArgumentStructGeneration - if isinstance(node, StructBuffer): + if isinstance(node, (StructBuffer, TransientBuffer)): continue name = node.name diff --git a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py index 849db08576..b7b8787f49 100644 --- a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py +++ b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py @@ -22,7 +22,7 @@ class L3Dma(AsyncDma): _transferTemplates = { 2: NodeTemplate( - "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t)${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" ) } _waitingStrategy = PerTensorWaitingStrategy(L3DmaFuture) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py index 0db3109aea..1914083f94 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py @@ -5,7 +5,7 @@ import copy import math from abc import abstractmethod -from typing import List, Optional, Tuple, TypeVar +from typing import Dict, List, Optional, Tuple, TypeVar import numpy as np @@ -19,8 +19,10 @@ from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import TilingHoistingMixIn from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PrototypeTilingMixIn from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint -from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, 
VariableReplacementScheme, \ - calculateFlatOffset, minimizeRectangle, minimizeVariableReplacement, padOffset, padShape, stridesFromShape +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme, calculateFlatOffset, minimizeRectangle, minimizeVariableReplacement, padOffset, \ + padShape, stridesFromShape T = TypeVar('T') @@ -241,8 +243,18 @@ def apply(self, assert isinstance(buffer, VariableBuffer) unraveledOpRepr[key] = ctxt.unravelReference(buffer).name - variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( - nodeMemoryConstraint, self.localMemory, ctxt, unraveledOpRepr) + tileConstr: TileConstraint = template.tileConstraint + transfers: Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]] = baseExecutionBlock.transfers + targetMemoryTransfers = { + tensorName: memTransfers.get(self.localMemory, None) for tensorName, memTransfers in transfers.items() + } + + if any(v is None for v in targetMemoryTransfers.values()): + return ctxt, executionBlock + + variableReplacement, tilingSchedules = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.localMemory, + ctxt, unraveledOpRepr, + targetMemoryTransfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py index 76eacd10dd..cbc0ce57cc 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py @@ -4,7 +4,7 @@ import copy import itertools -from typing import List, Tuple +from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import Struct from 
Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureExecutionBlock @@ -15,8 +15,10 @@ _ReferenceBuffer from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import TilingHoistingMixIn from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerExtension import Tiler -from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, minimizeVariableReplacement +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme, \ + minimizeVariableReplacement class TilingVariableReplacement(CodeTransformationPass, IntrospectiveCodeTransformationMixIn, TilingHoistingMixIn): @@ -133,8 +135,18 @@ def apply(self, for key, value in operatorRepresentation.items() } - variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( - nodeMemoryConstraint, self.targetMemLevel, ctxt, unraveledOpRepr) + tileConstr: TileConstraint = template.tileConstraint + transfers: Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]] = baseExecutionBlock.transfers + targetMemoryTransfers = { + tensorName: memTransfers.get(self.targetMemLevel, None) for tensorName, memTransfers in transfers.items() + } + + if any(v is None for v in targetMemoryTransfers.values()): + return ctxt, executionBlock + + variableReplacement, tilingSchedules = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, + ctxt, unraveledOpRepr, + targetMemoryTransfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) operatorRepresentation.update(newOpRepr) @@ -233,8 +245,17 @@ def apply(self, for key, value in operatorRepresentation.items() } - variableReplacement, _ = template.tileConstraint.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, - ctxt, unraveledOpRepr) + 
tileConstr: TileConstraint = template.tileConstraint + transfers: Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]] = baseExecutionBlock.transfers + targetMemoryTransfers = { + tensorName: memTransfers.get(self.targetMemLevel, None) for tensorName, memTransfers in transfers.items() + } + + if any(v is None for v in targetMemoryTransfers.values()): + return ctxt, executionBlock + + variableReplacement, _ = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, ctxt, + unraveledOpRepr, targetMemoryTransfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) operatorRepresentation.update(newOpRepr) diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py index 5b067b2ce9..09c0d6e8a7 100644 --- a/Deeploy/TilingExtension/TileConstraint.py +++ b/Deeploy/TilingExtension/TileConstraint.py @@ -2,18 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 -import copy from abc import abstractmethod from typing import Dict, List, Optional, Tuple, Union -import numpy as np from ortools.constraint_solver.pywrapcp import IntVar from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint, TensorMemoryConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TilerModel import TilerModel -from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, MemoryTransfer, \ - TilingSchedule, VariableReplacementScheme, computeTileHyperRectangles +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme class TileConstraint(): @@ -91,81 +88,17 @@ def sanitizeTilingSchedule(tilingSchedule: TilingSchedule) -> TilingSchedule: @classmethod def wrapTilingSolution( - cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, 
ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: - - def getMemoryTransfer(tensorConstraint: TensorMemoryConstraint, sourceCube: HyperRectangle, - sourceMemoryLevel: str, targetMemoryLevel: str) -> MemoryTransfer: - - size = np.prod(sourceCube.dims) - sourceConstraint = MemoryConstraint(sourceMemoryLevel, size) - sourceConstraint.shape = sourceCube.dims - - destConstraint = copy.copy(tensorConstraint.memoryConstraints[targetMemoryLevel]) - - if any(dim1 > dim2 for dim1, dim2 in zip(destConstraint.shape, sourceConstraint.shape)): - destConstraint.shape = sourceConstraint.shape - - return MemoryTransfer(sourceConstraint, destConstraint) - - def _offsetAdd(offsetA: Tuple[int, ...], offsetB: Tuple[int, ...]) -> Tuple[int, ...]: - return tuple(dimA + dimB for dimA, dimB in zip(offsetA, offsetB)) - - def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List[AbsoluteHyperRectangle], - sourceMemoryLevel: str, - targetMemoryLevel: str) -> Tuple[List[AbsoluteHyperRectangle], List[int]]: - solution = [] - solutionLengths = [] - - for sourceCube in sourceCubes: - memTransfer = getMemoryTransfer(tensorConstraint, sourceCube.rectangle, sourceMemoryLevel, - targetMemoryLevel) - solutionCubes = computeTileHyperRectangles(memTransfer) - solutionAbsoluteCubes = [ - AbsoluteHyperRectangle(rectangle = cube, - absoluteOffset = _offsetAdd(sourceCube.absoluteOffset, cube.offset)) - for cube in solutionCubes - ] - solution += solutionAbsoluteCubes - solutionLengths.append(len(solutionAbsoluteCubes)) - - return solution, solutionLengths - + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation, + transfers: Dict[str, + List[List[AbsoluteHyperRectangle]]]) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: assert len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one 
output!" - - outVar, outTensorConstraint = next(iter(tilingSolution.outputTensorMemoryConstraints.items())) - memoryPath = list(outTensorConstraint.memoryConstraints.keys()) - - assert targetMemLevel in memoryPath, \ - f"Target memory level {targetMemLevel} does not exist in the memory path {memoryPath}" - - targetIdx = memoryPath.index(targetMemLevel) - - if targetIdx == 0: - # SCHEREMO: Watch out - this happens if inputs are in L(N+1) but outputs only in L(N) - targetIdx = 1 - - fullShape = ctxt.lookup(outVar).shape - initialOffset = (0,) * len(fullShape) - outputCubes = [ - AbsoluteHyperRectangle(rectangle = HyperRectangle(offset = initialOffset, dims = tuple(fullShape)), - absoluteOffset = initialOffset) - ] - - for source, target in zip(memoryPath[:targetIdx], memoryPath[1:targetIdx + 1]): - outputCubes, solutionLengths = getCubeTransfers(outTensorConstraint, outputCubes, source, target) - - arrayOfCubes = [] - _idx = 0 - for idxLen in solutionLengths: - arrayOfCubes += [outputCubes[_idx:_idx + idxLen]] - _idx += idxLen + outVar, _ = next(iter(tilingSolution.outputTensorMemoryConstraints.items())) varReplacements = [] tilingSchedules = [] - for _outputCubes in arrayOfCubes: - + for _outputCubes in transfers[outVar]: varReplacement, tilingSchedule = cls.serializeTilingSolution(tilingSolution, _outputCubes, targetMemLevel, ctxt, operatorRepresentation) sanitizedTilingSchedule = cls.sanitizeTilingSchedule(tilingSchedule) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index bdae0fbdcf..87884837fc 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -36,6 +36,7 @@ from Deeploy.TilingExtension.MemoryScheduler import MemoryBlock, MemoryScheduler from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, 
computeTileHyperRectangles TilingSolution = List[PatternMemoryConstraints] MemoryMap = Dict[str, List[List[MemoryBlock]]] @@ -940,6 +941,34 @@ def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]] assert stepIdx in range(lifetime[0], lifetime[-1] + 1), f"Invalid memory map! Buffer {tensor.name} is not alive at step {stepIdx}!" + def getTransfers(self, tensorMc: TensorMemoryConstraint) -> Dict[str, List[List[AbsoluteHyperRectangle]]]: + transfers: Dict[str, List[List[AbsoluteHyperRectangle]]] = {} + mcs = list(tensorMc.memoryConstraints.items()) + for (externalMemory, externalMc), (localMemory, localMc) in zip(mcs[:-1], mcs[1:]): + # TODO: Should we also use externalMemory as a key in the transfers? + if externalMemory not in transfers: + assert externalMc.shape is not None + shape = externalMc.shape + zeroOffset = (0,) * len(shape) + externalAbsoluteRectangles = [AbsoluteHyperRectangle(HyperRectangle(zeroOffset, shape), zeroOffset)] + else: + # Flatten + externalAbsoluteRectangles = [rect for _list in transfers[externalMemory] for rect in _list] + + transfers[localMemory] = [[ + AbsoluteHyperRectangle(rect, tuple(a + b + for a, b in zip(extAbsRect.absoluteOffset, rect.offset))) + for rect in computeTileHyperRectangles(extAbsRect.rectangle.dims, localMc.shape) + ] + for extAbsRect in externalAbsoluteRectangles] + return transfers + + def getIoTransfers(self, + patternMc: PatternMemoryConstraints) -> Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]: + assert len(patternMc.nodeConstraints) == 1, "Only layerwise supported for now!" 
+ tMcs = patternMc.nodeConstraints[0].tensorMemoryConstraints + return {name: self.getTransfers(mc) for name, mc in tMcs.items()} + class TilerDeployerWrapper(NetworkDeployerWrapper): @@ -996,6 +1025,7 @@ def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optio # SCHEREMO: Annotate execution block with solution for layer, pattern in zip(self.layerBinding.values(), tilingSolution): layer.mapper.binder.executionBlock.patternMemoryConstraint = pattern + layer.mapper.binder.executionBlock.transfers = self.tiler.getIoTransfers(pattern) # SCHEREMO: Code generation STUB diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py index 604ba23c9d..40dc975ee6 100644 --- a/Deeploy/TilingExtension/TilingCodegen.py +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -11,13 +11,6 @@ from Deeploy.AbstractDataTypes import Pointer from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer -from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint - - -@dataclass -class MemoryTransfer(): - source: MemoryConstraint - destination: MemoryConstraint @dataclass @@ -242,18 +235,12 @@ def calculateFlatOffsetInBytes(tile: HyperRectangle, referenceBuffer: VariableBu (referenceBuffer._type.referencedType.typeWidth // 8)) -def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRectangle]: - assert memoryTransfer.source.shape is not None, "Source transfer shape cannot be undefined!" - assert memoryTransfer.destination.shape is not None, "Destination transfer shape cannot be undefined!" - - assert len(memoryTransfer.source.shape) == len(memoryTransfer.destination.shape), \ - f"Source and target of memory transfer {memoryTransfer} don't have the same number of dimensions!" 
- - largeShape = memoryTransfer.source.shape - smallShape = memoryTransfer.destination.shape +def computeTileHyperRectangles(externalShape: Tuple[int, ...], localShape: Tuple[int, ...]) -> List[HyperRectangle]: + assert len(externalShape) == len(localShape), \ + f"External and local memory shapes don't have the same number of dimensions! External {externalShape} vs. Local {localShape}" - for dimIdx, (dimSizeSmall, dimSizeLarge) in enumerate(zip(smallShape, largeShape)): - assert dimSizeSmall <= dimSizeLarge, f"smallShape[{dimIdx}] should not be bigger then largeShape[{dimIdx}]. ({dimSizeSmall} > {dimSizeLarge})" + # LMACAN: The local shape dimensions are of the local buffer so if the external tile is smaller, that's fine + localShape = tuple(min(ext, loc) for ext, loc in zip(externalShape, localShape)) def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]: tileCount = np.prod(tileIndexEnd) @@ -270,18 +257,18 @@ def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]: tileHyperRectangles = [] tileIndexEnd = [ - int(np.ceil(dimSizeLarge / dimSizeSmall)) for dimSizeLarge, dimSizeSmall in zip(largeShape, smallShape) + int(np.ceil(dimSizeLarge / dimSizeSmall)) for dimSizeLarge, dimSizeSmall in zip(externalShape, localShape) ] for tileIndex in nextTileIndex(tileIndexEnd): - tileOffset = tuple(dimIdx * dimSizeSmall for dimIdx, dimSizeSmall in zip(tileIndex, smallShape)) - for dimIdx, (dimOffset, dimSizeLarge) in enumerate(zip(tileOffset, largeShape)): + tileOffset = tuple(dimIdx * dimSizeSmall for dimIdx, dimSizeSmall in zip(tileIndex, localShape)) + for dimIdx, (dimOffset, dimSizeLarge) in enumerate(zip(tileOffset, externalShape)): assert dimOffset >= 0, f"tileOffset[{dimIdx}] shoud not be smaller then zero ({dimOffset} < 0)" assert dimOffset < dimSizeLarge, f"tileOffset[{dimIdx}] should not be bigger or equal then largeShape[{dimIdx}] ({dimOffset} >= {dimSizeLarge})" tileSize = tuple( min(dimSizeSmall, dimSizeLarge - dimOffset) - for 
dimSizeSmall, dimSizeLarge, dimOffset in zip(smallShape, largeShape, tileOffset)) - for dimIdx, (dimSize, dimSizeSmall) in enumerate(zip(tileSize, smallShape)): + for dimSizeSmall, dimSizeLarge, dimOffset in zip(localShape, externalShape, tileOffset)) + for dimIdx, (dimSize, dimSizeSmall) in enumerate(zip(tileSize, localShape)): assert dimSize > 0, f"tileOffset[{dimIdx}] shoud not be smaller or equal then zero ({dimSize} <= 0)" assert dimSize <= dimSizeSmall, f"tileSize[{dimIdx}] should not be bigger then smallShape[{dimIdx}] ({dimSize} > {dimSizeSmall})"