[mlir] gpu async region pass: do not add async if gpu.alloc is host_shared #90876
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Author: Sang Ik Lee (silee2)

Changes

gpu.alloc host_shared is not an async operation. Update the gpu async region pass to handle this case correctly.

Full diff: https://github.com/llvm/llvm-project/pull/90876.diff

8 Files Affected:
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 3a4fc7d8063f40..66d9ce7cfc5117 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -873,18 +873,33 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::DeallocOp deallocOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
+ bool isHostShared = false;
+ auto *op = deallocOp.getOperation();
+ for (auto operand : op->getOperands()) {
+ gpu::AllocOp allocOp = operand.getDefiningOp<gpu::AllocOp>();
+ if (!allocOp)
+ continue;
+ if (allocOp.getHostShared())
+ isHostShared = true;
+ }
+
if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)) ||
- failed(isAsyncWithOneDependency(rewriter, deallocOp)))
+ (!isHostShared && failed(isAsyncWithOneDependency(rewriter, deallocOp))))
return failure();
Location loc = deallocOp.getLoc();
Value pointer =
MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
- Value stream = adaptor.getAsyncDependencies().front();
+ Value stream = isHostShared
+ ? rewriter.create<mlir::LLVM::ZeroOp>(loc, llvmPointerType)
+ : adaptor.getAsyncDependencies().front();
deallocCallBuilder.create(loc, rewriter, {pointer, stream});
- rewriter.replaceOp(deallocOp, {stream});
+ if (isHostShared)
+ rewriter.eraseOp(deallocOp);
+ else
+ rewriter.replaceOp(deallocOp, {stream});
return success();
}
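The intended effect on lowering, exercised by the updated lower-alloc-to-gpu-runtime-calls.mlir test below: a synchronous gpu.dealloc of host_shared memory becomes an mgpuMemFree call with a null stream, and no stream create/synchronize/destroy calls are emitted. A rough sketch (the descriptor value %desc and its struct type are illustrative, not names from the diff):

```mlir
// Input:
//   %0 = gpu.alloc host_shared (%size) : memref<?xf32>
//   gpu.dealloc %0 : memref<?xf32>
// Lowered output (abridged):
%ptr    = llvm.extractvalue %desc[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%stream = llvm.mlir.zero : !llvm.ptr
llvm.call @mgpuMemFree(%ptr, %stream) : (!llvm.ptr, !llvm.ptr) -> ()
```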
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
index b2fa3a99c53fc3..700a22822c5ab9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -73,6 +73,17 @@ struct GpuAsyncRegionPass::ThreadTokenCallback {
LogicalResult visit(Operation *op) {
if (isa<gpu::LaunchOp>(op))
return op->emitOpError("replace with gpu.launch_func first");
+ // gpu.alloc() host_shared cannot be done async
+ if (isa<gpu::AllocOp>(op)) {
+ gpu::AllocOp allocOp = dyn_cast<gpu::AllocOp>(op);
+ if (allocOp.getHostShared())
+ return success();
+ }
+ if (isa<gpu::DeallocOp>(op)) {
+ gpu::AllocOp allocOp = op->getOperand(0).getDefiningOp<gpu::AllocOp>();
+ if (allocOp.getHostShared())
+ return success();
+ }
if (auto waitOp = llvm::dyn_cast<gpu::WaitOp>(op)) {
if (currentToken)
waitOp.addAsyncDependency(currentToken);
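For illustration, here is the behavior these early-outs produce, adapted from the test added to mlir/test/Dialect/GPU/async-region.mlir further down: a host_shared allocation and its matching deallocation are left in synchronous form instead of being rewritten with gpu.wait tokens and async dependencies.

```mlir
// gpu-async-region leaves both ops untouched: host_shared is not async.
func.func @alloc_host_shared() {
  %0 = gpu.alloc host_shared () : memref<7xf32>
  gpu.dealloc %0 : memref<7xf32>
  return
}
```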
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index c250340c38fc77..6653850fa08bb8 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -90,7 +90,13 @@ static void *allocDeviceMemory(sycl::queue *queue, size_t size, bool isShared) {
}
static void deallocDeviceMemory(sycl::queue *queue, void *ptr) {
- sycl::free(ptr, *queue);
+ if (queue == nullptr) {
+ queue = new sycl::queue(getDefaultContext(), getDefaultDevice());
+ sycl::free(ptr, *queue);
+ delete queue;
+ } else {
+ sycl::free(ptr, *queue);
+ }
}
static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
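This complements the lowering change above: a host_shared gpu.dealloc now passes a null stream to mgpuMemFree, so deallocDeviceMemory can be reached with queue == nullptr; in that case a temporary queue on the default device and context is created just long enough to perform the free.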
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index ae8b7aaac7fd94..74661a0b89b1d1 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -29,12 +29,10 @@ module attributes {gpu.container_module} {
// CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
// CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[nullptr]], %[[isHostShared]])
%0 = gpu.alloc host_shared (%size) : memref<?xf32>
- // CHECK: %[[stream:.*]] = llvm.call @mgpuStreamCreate()
- %1 = gpu.wait async
- %2 = gpu.dealloc async [%1] %0 : memref<?xf32>
- // CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]])
- // CHECK: llvm.call @mgpuStreamDestroy(%[[stream]])
- gpu.wait [%2]
+ // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
+ // CHECK: %[[stream:.*]] = llvm.mlir.zero : !llvm.ptr
+ // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
+ gpu.dealloc %0 : memref<?xf32>
return
}
}
diff --git a/mlir/test/Dialect/GPU/async-region.mlir b/mlir/test/Dialect/GPU/async-region.mlir
index 00832c45de1146..d68d4d27c97922 100644
--- a/mlir/test/Dialect/GPU/async-region.mlir
+++ b/mlir/test/Dialect/GPU/async-region.mlir
@@ -189,4 +189,13 @@ module attributes {gpu.container_module} {
gpu.wait
return
}
+
+ // CHECK-LABEL:func @alloc_host_shared()
+ func.func @alloc_host_shared() {
+ // CHECK: %[[m:.*]] = gpu.alloc host_shared () : memref<7xf32>
+ %0 = gpu.alloc host_shared() : memref<7xf32>
+ // CHECK: gpu.dealloc %[[m]] : memref<7xf32>
+ gpu.dealloc %0 : memref<7xf32>
+ return
+ }
}
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
index c0e2903aee2d12..6d01ffba29c347 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_sycl_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -25,16 +25,12 @@ module @add attributes {gpu.container_module} {
%memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
%memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
- %2 = gpu.wait async
- %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
- gpu.wait [%3]
+ gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
%alloc = memref.alloc() : memref<2x2x2xf32>
memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
- %4 = gpu.wait async
- %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
- %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
- %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
- gpu.wait [%7]
+ gpu.dealloc %memref_2 : memref<2x2x2xf32>
+ gpu.dealloc %memref_0 : memref<2x2x2xf32>
+ gpu.dealloc %mem : memref<2x2x2xf32>
return %alloc : memref<2x2x2xf32>
}
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
index 4ac1533b75d203..b9c740971ccd2b 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_sycl_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -25,16 +25,12 @@ module @add attributes {gpu.container_module} {
%memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
%memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
- %2 = gpu.wait async
- %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
- gpu.wait [%3]
+ gpu.launch_func @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
%alloc = memref.alloc() : memref<3x3xi64>
memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
- %4 = gpu.wait async
- %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
- %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
- %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
- gpu.wait [%7]
+ gpu.dealloc %memref_2 : memref<3x3xi64>
+ gpu.dealloc %memref_0 : memref<3x3xi64>
+ gpu.dealloc %mem : memref<3x3xi64>
return %alloc : memref<3x3xi64>
}
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
index 162a793305e972..770980f810c686 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_sycl_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -40,19 +40,14 @@ module @relu attributes {gpu.container_module} {
%memref = gpu.alloc host_shared () : memref<4x5xf32>
memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
%memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
- %2 = gpu.wait async
- %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
- gpu.wait [%3]
+ gpu.launch_func @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
%memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
- %4 = gpu.wait async
- %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
- gpu.wait [%5]
+ gpu.launch_func @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
%alloc = memref.alloc() : memref<4x5xf32>
memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
- %6 = gpu.wait async
- %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
- %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
- %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
+ gpu.dealloc %memref_1 : memref<4x5xf32>
+ gpu.dealloc %memref_0 : memref<4x5xi1>
+ gpu.dealloc %memref : memref<4x5xf32>
return %alloc : memref<4x5xf32>
}
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.alloc host_shared is not an async operation. Update the gpu async region pass to handle this case correctly.
Also, a gpu.dealloc operating on host_shared memory (when the allocation can be analyzed) is lowered to a non-async operation, as sketched below.
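A minimal sketch of what "can be analyzed" means here, based on the operand walk added to ConvertDeallocOpToGpuRuntimeCallPattern above; the IR is illustrative rather than taken from the tests:

```mlir
// Analyzable: the dealloc operand is directly defined by a host_shared
// gpu.alloc, so the dealloc lowers to a synchronous mgpuMemFree call.
%m = gpu.alloc host_shared () : memref<7xf32>
gpu.dealloc %m : memref<7xf32>

// Not analyzable: the operand is a block argument, getDefiningOp finds
// no gpu.alloc, and the dealloc keeps the usual async lowering path.
func.func @free_it(%arg0 : memref<7xf32>) {
  %t0 = gpu.wait async
  %t1 = gpu.dealloc async [%t0] %arg0 : memref<7xf32>
  gpu.wait [%t1]
  return
}
```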
Currently only the GPU to SPIR-V to SYCL runtime path is affected by this change.
The SYCL runtime wrapper and the GPU to SYCL integration tests are updated accordingly.