diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index 73d86283a5940..3f1f655c041f2 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -546,12 +546,14 @@ def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
   }];
 
   let arguments = (ins AnyUnrankedMemRef:$tensor,
-                       Variadic<Index>:$boxDimensions);
+                       Variadic<Index>:$boxDimensions,
+                       DenseI64ArrayAttr:$static_boxDimensions);
   let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
   let assemblyFormat = [{
-    $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
+    $tensor `box` custom<DynamicIndexList>($boxDimensions, $static_boxDimensions) attr-dict `:` type($tensor) `->` type($tensorMap)
   }];
   let hasVerifier = 1;
+  let hasFolder = 1;
 }
 
 def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 80b3d85488495..2bac3e4f24db1 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -1184,9 +1184,17 @@ struct NVGPUTmaCreateDescriptorOpLowering
     Value boxArrayPtr = b.create<LLVM::AllocaOp>(llvmPointerType, llvmInt64Type,
                                                  makeI64Const(b, 5));
-    for (auto [index, value] : llvm::enumerate(adaptor.getBoxDimensions())) {
+    unsigned idx = 0;
+    ValueRange dynamicDim = adaptor.getBoxDimensions();
+    for (auto [index, shape] :
+         llvm::enumerate(adaptor.getStaticBoxDimensions())) {
       Value gep = b.create<LLVM::GEPOp>(llvmPointerType, llvmPointerType,
                                         boxArrayPtr, makeI64Const(b, index));
+      Value value;
+      if (ShapedType::isDynamic(shape))
+        value = dynamicDim[idx++];
+      else
+        value = makeI64Const(b, shape);
       b.create<LLVM::StoreOp>(value, gep);
     }
 
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index abbdb6a0f53ec..9e13f9df9b913 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -23,6 +23,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Verifier.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
@@ -458,6 +459,10 @@ LogicalResult TmaAsyncStoreOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// NVGPU_TmaCreateDescriptorOp
+//===----------------------------------------------------------------------===//
+
 LogicalResult TmaCreateDescriptorOp::verify() {
   if (getBoxDimensions().size() > kMaxTMATensorDimension) {
     return emitError() << "Maximum " << kMaxTMATensorDimension
@@ -472,6 +477,48 @@ LogicalResult TmaCreateDescriptorOp::verify() {
   return success();
 }
 
+static Value
+TmaCreateDescriptorFoldBoxConstant(TmaCreateDescriptorOp op,
+                                   TmaCreateDescriptorOp::FoldAdaptor adaptor) {
+  std::vector<int64_t> staticBoxDimensions = op.getStaticBoxDimensions().vec();
+  OperandRange dynamicBoxDimensions = op.getBoxDimensions();
+  SmallVector<Value> operands = {op.getTensor()};
+  ArrayRef<Attribute> dynamicBoxDimensionAttrs = adaptor.getBoxDimensions();
+  if (staticBoxDimensions.empty())
+    return {};
+
+  // If `opChange` is true, `op` is updated in place.
+  bool opChange = false;
+  unsigned idx = 0;
+
+  for (unsigned i = 0, e = staticBoxDimensions.size(); i < e; ++i) {
+    if (!ShapedType::isDynamic(staticBoxDimensions[i]))
+      continue;
+    Attribute dynamicBoxDimensionAttr = dynamicBoxDimensionAttrs[idx];
+    Value dynamicDimension = dynamicBoxDimensions[idx++];
+    if (auto attr =
+            mlir::dyn_cast_if_present<IntegerAttr>(dynamicBoxDimensionAttr)) {
+      staticBoxDimensions[i] = attr.getInt();
+      opChange = true;
+      continue;
+    }
+    operands.push_back(dynamicDimension);
+  }
+
+  if (opChange) {
+    op.setStaticBoxDimensions(staticBoxDimensions);
+    op.getOperation()->setOperands(operands);
+    return op.getResult();
+  }
+  return {};
+}
+
+OpFoldResult TmaCreateDescriptorOp::fold(FoldAdaptor adaptor) {
+  if (auto val = TmaCreateDescriptorFoldBoxConstant(*this, adaptor))
+    return val;
+  return OpFoldResult();
+}
+
 //===----------------------------------------------------------------------===//
 // NVGPU_WarpgroupGenerateDescriptorOp
 //===----------------------------------------------------------------------===//
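Taken together, the `DenseI64ArrayAttr` argument, the `custom<DynamicIndexList>` directive, and the new folder let canonicalization absorb constant box dimensions into the attribute. A minimal before/after sketch (the `!descriptor` alias and the shapes are illustrative, mirroring the canonicalization test further down):

```mlir
// Before folding: box sizes arrive as SSA constants.
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%desc = nvgpu.tma.create.descriptor %unranked box[%c64, %c16]
    : memref<*xf16> -> !descriptor

// After folding: the constants move into static_boxDimensions and are
// printed inline by the custom<DynamicIndexList> directive.
%desc = nvgpu.tma.create.descriptor %unranked box [64, 16]
    : memref<*xf16> -> !descriptor
```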
diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
index d2c94b124cdfb..f23bc754eb430 100644
--- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
@@ -965,6 +965,7 @@ HopperBuilder::buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
   SmallVector<Value> sizes =
       getValueOrCreateConstantIndexOp(rewriter, loc, mixedSizes);
+  SmallVector<int64_t> static_dims(sizes.size(), ShapedType::kDynamic);
   auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter);
   Value desc = rewriter.create<nvgpu::TmaCreateDescriptorOp>(
       loc,
@@ -975,7 +976,7 @@ HopperBuilder::buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
           TensorMapSwizzleKind::SWIZZLE_NONE,
           TensorMapL2PromoKind::L2PROMO_NONE, TensorMapOOBKind::OOB_ZERO,
           TensorMapInterleaveKind::INTERLEAVE_NONE),
-      unrankedMemRef, sizes);
+      unrankedMemRef, sizes, static_dims);
   return cast<TypedValue<nvgpu::TensorMapDescriptorType>>(desc);
 }
 
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index d0bc806e0aa8c..ffb7e62f250b0 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -813,6 +813,32 @@ func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : memref<128xf32>) {
   func.return
 }
 
+func.func @create_tensor_map_constant_box_dim(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : memref<128xf32>) {
+  %devicePtr2d_unranked = memref.cast %devicePtr2d : memref<64x128xf32> to memref<*xf32>
+  // CHECK: %[[C5_0:.*]] = llvm.mlir.constant(5 : i32) : i64
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[C5_0]] x i64 : (i64) -> !llvm.ptr
+  // CHECK: %[[C0_0:.*]] = llvm.mlir.constant(0 : i32) : i64
+  // CHECK: %[[GEP_0:.*]] = llvm.getelementptr %[[ALLOCA]]{{\[}}%[[C0_0]]] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : i32) : i64
+  // CHECK: llvm.store %[[C64]], %[[GEP_0]] : i64, !llvm.ptr
+  // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i64
+  // CHECK: %[[GEP_1:.*]] = llvm.getelementptr %[[ALLOCA]]{{\[}}%[[C1]]] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+  // CHECK: %[[C128_0:.*]] = llvm.mlir.constant(128 : i32) : i64
+  // CHECK: llvm.store %[[C128_0]], %[[GEP_1]] : i64, !llvm.ptr
+  // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[ALLOCA]])
+  %tensorMap2d = nvgpu.tma.create.descriptor %devicePtr2d_unranked box[64, 128] : memref<*xf32> -> !tensorMap2d
+
+  %devicePtr1d_unranked = memref.cast %devicePtr1d : memref<128xf32> to memref<*xf32>
+  // CHECK: %[[C5_1:.*]] = llvm.mlir.constant(5 : i32) : i64
+  // CHECK: %[[ALLOCA_1:.*]] = llvm.alloca %[[C5_1]] x i64 : (i64) -> !llvm.ptr
+  // CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i32) : i64
+  // CHECK: %[[GEP_2:.*]] = llvm.getelementptr %[[ALLOCA_1]]{{\[}}%[[C0_1]]] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+  // CHECK: %[[C128_1:.*]] = llvm.mlir.constant(128 : i32) : i64
+  // CHECK: llvm.store %[[C128_1]], %[[GEP_2]] : i64, !llvm.ptr
+  // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[ALLOCA_1]])
+  %tensorMap1d = nvgpu.tma.create.descriptor %devicePtr1d_unranked box[128] : memref<*xf32> -> !tensorMap1d
+  func.return
+}
+
 // CHECK-LABEL: @tma_prefetch(
 // CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none, l2promo = none, oob = nan, interleave = none>, %[[arg1:[a-zA-Z0-9_]+]]: i1
 func.func @tma_prefetch(%tensorMap1d: !tensorMap1d, %p : i1) {
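The lowering also covers the mixed case: static entries are materialized with `makeI64Const`, while each dynamic entry consumes the next variadic operand. A hypothetical input (not part of the test above), assuming `%n` is only known at runtime and `!descriptor` is an illustrative alias:

```mlir
// Only the trailing dimension was foldable; %n stays as the single
// dynamic box-dimension operand of the op.
%desc = nvgpu.tma.create.descriptor %unranked box [%n, 128]
    : memref<*xf32> -> !descriptor
```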
diff --git a/mlir/test/Dialect/NVGPU/canonicalization.mlir b/mlir/test/Dialect/NVGPU/canonicalization.mlir
index a7fbfd8067395..9939461769c30 100644
--- a/mlir/test/Dialect/NVGPU/canonicalization.mlir
+++ b/mlir/test/Dialect/NVGPU/canonicalization.mlir
@@ -27,4 +27,19 @@ gpu.module @main_kernel {
     nvvm.cp.async.bulk.wait_group 0
     gpu.return
   }
-}
\ No newline at end of file
+}
+
+// -----
+
+!descriptor = !nvgpu.tensormap.descriptor<tensor = memref<64x16xf16, 3>, swizzle = none, l2promo=none, oob=zero, interleave=none>
+
+func.func @main() {
+  %a_host = memref.alloc() : memref<64x16xf16>
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %a_device = gpu.alloc() : memref<64x16xf16>
+  %a_device_unranked = memref.cast %a_device : memref<64x16xf16> to memref<*xf16>
+  // CHECK: nvgpu.tma.create.descriptor %{{.*}} box [64, 16]
+  %a_device_map = nvgpu.tma.create.descriptor %a_device_unranked box[%c64, %c16] : memref<*xf16> -> !descriptor
+  return
+}
diff --git a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
index 40acd82cd0558..aa981b2688b81 100644
--- a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
+++ b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
@@ -18,12 +18,12 @@ func.func @main() {
   // CHECK: %[[M1:.*]] = memref.cast %{{.*}} : memref<64x32xf32> to memref<*xf32>
   // CHECK: %[[c64:.*]] = arith.constant 64 : index
   // CHECK: %[[c32:.*]] = arith.constant 32 : index
-  // CHECK: %[[D1:.*]] = nvgpu.tma.create.descriptor %[[M1]] box[%[[c64]], %[[c32]]]
+  // CHECK: %[[D1:.*]] = nvgpu.tma.create.descriptor %[[M1]] box [%[[c64]], %[[c32]]]
   // CHECK-SAME: : memref<*xf32> -> <tensor = memref<64x32xf32, #gpu.address_space<workgroup>>, swizzle = none, l2promo = none, oob = zero, interleave = none>
   // CHECK: %[[cast_2:.*]] = memref.cast %memref_0 : memref<8x32xf32> to memref<*xf32>
   // CHECK: %[[c8_2:.*]] = arith.constant 8 : index
   // CHECK: %[[c32_2:.*]] = arith.constant 32 : index
-  // CHECK: %[[D2:.*]] = nvgpu.tma.create.descriptor %cast_2 box[%[[c8_2]], %[[c32_2]]]
+  // CHECK: %[[D2:.*]] = nvgpu.tma.create.descriptor %cast_2 box [%[[c8_2]], %[[c32_2]]]
   // CHECK-SAME: : memref<*xf32> -> <tensor = memref<8x32xf32, #gpu.address_space<workgroup>>, swizzle = none, l2promo = none, oob = zero, interleave = none>
   // CHECK: gpu.launch
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
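The CHECK updates above track the printer change: the `custom<DynamicIndexList>` directive emits a space between `box` and the bracketed list, and the parser accepts both spellings. A round-trip sketch (operand and type names illustrative):

```mlir
// Both forms parse to the same op; the second is the printed form.
%d0 = nvgpu.tma.create.descriptor %m box[%c8, %c32] : memref<*xf32> -> !descriptor
%d1 = nvgpu.tma.create.descriptor %m box [%c8, %c32] : memref<*xf32> -> !descriptor
```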
diff --git a/mlir/test/Examples/NVGPU/tools/nvdsl.py b/mlir/test/Examples/NVGPU/tools/nvdsl.py
index 90dbb2355e1c8..9aa385a21f601 100644
--- a/mlir/test/Examples/NVGPU/tools/nvdsl.py
+++ b/mlir/test/Examples/NVGPU/tools/nvdsl.py
@@ -143,8 +143,12 @@ def create_descriptor(self, device_ptr):
             ),
             device_ptr,
         )
+        box_static_dim = [MLIR_DYNAMIC] * len(self.tma_box_shape)
         self.tma_descriptor = nvgpu.TmaCreateDescriptorOp(
-            tma_descriptor_ty, device_unranked_memref, map(const, self.tma_box_shape)
+            tma_descriptor_ty,
+            device_unranked_memref,
+            map(const, self.tma_box_shape),
+            box_static_dim,
         )
         return self.tma_descriptor.result
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/python/tools/matmulBuilder.py b/mlir/test/Integration/GPU/CUDA/sm90/python/tools/matmulBuilder.py
index 5394d4a327255..7426e00ac7fc0 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/python/tools/matmulBuilder.py
+++ b/mlir/test/Integration/GPU/CUDA/sm90/python/tools/matmulBuilder.py
@@ -64,8 +64,12 @@ def tma_descriptor_op(self, device_ptr):
             ),
             device_ptr,
         )
+        box_static_dim = [MLIR_DYNAMIC] * len(self.tma_box_shape)
         tma_descriptor_op = nvgpu.TmaCreateDescriptorOp(
-            tma_descriptor_ty, device_unranked_memref, map(c, self.tma_box_shape)
+            tma_descriptor_ty,
+            device_unranked_memref,
+            map(c, self.tma_box_shape),
+            box_static_dim,
         )
         return tma_descriptor_op.result
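The Python builders above pass an all-dynamic `box_static_dim`, so they keep emitting the fully dynamic form and rely on later folding. A sketch of the resulting IR under that assumption (shapes and the `!descriptor` alias illustrative):

```mlir
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
// static_boxDimensions is all kDynamic here, so both sizes remain
// SSA operands until canonicalization folds them.
%desc = nvgpu.tma.create.descriptor %unranked box[%c64, %c128]
    : memref<*xf16> -> !descriptor
```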