diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 3a4fc7d8063f4..66d9ce7cfc511 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -873,18 +873,33 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
 LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::DeallocOp deallocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
+  bool isHostShared = false;
+  auto *op = deallocOp.getOperation();
+  for (auto operand : op->getOperands()) {
+    gpu::AllocOp allocOp = operand.getDefiningOp<gpu::AllocOp>();
+    if (!allocOp)
+      continue;
+    if (allocOp.getHostShared())
+      isHostShared = true;
+  }
+
   if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)) ||
-      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
+      (!isHostShared && failed(isAsyncWithOneDependency(rewriter, deallocOp))))
     return failure();
 
   Location loc = deallocOp.getLoc();
 
   Value pointer =
       MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
-  Value stream = adaptor.getAsyncDependencies().front();
+  Value stream = isHostShared
+                     ? rewriter.create<mlir::LLVM::ZeroOp>(loc, llvmPointerType)
+                     : adaptor.getAsyncDependencies().front();
   deallocCallBuilder.create(loc, rewriter, {pointer, stream});
 
-  rewriter.replaceOp(deallocOp, {stream});
+  if (isHostShared)
+    rewriter.eraseOp(deallocOp);
+  else
+    rewriter.replaceOp(deallocOp, {stream});
 
   return success();
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
index b2fa3a99c53fc..700a22822c5ab 100644
--- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -73,6 +73,17 @@ struct GpuAsyncRegionPass::ThreadTokenCallback {
   LogicalResult visit(Operation *op) {
     if (isa<gpu::LaunchOp>(op))
       return op->emitOpError("replace with gpu.launch_func first");
+    // gpu.alloc() host_shared cannot be done async
+    if (isa<gpu::AllocOp>(op)) {
+      gpu::AllocOp allocOp = dyn_cast<gpu::AllocOp>(op);
+      if (allocOp.getHostShared())
+        return success();
+    }
+    if (isa<gpu::DeallocOp>(op)) {
+      gpu::AllocOp allocOp = op->getOperand(0).getDefiningOp<gpu::AllocOp>();
+      if (allocOp.getHostShared())
+        return success();
+    }
     if (auto waitOp = llvm::dyn_cast<gpu::WaitOp>(op)) {
       if (currentToken)
         waitOp.addAsyncDependency(currentToken);
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index c250340c38fc7..6653850fa08bb 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -90,7 +90,13 @@ static void *allocDeviceMemory(sycl::queue *queue, size_t size, bool isShared) {
 }
 
 static void deallocDeviceMemory(sycl::queue *queue, void *ptr) {
-  sycl::free(ptr, *queue);
+  if (queue == nullptr) {
+    queue = new sycl::queue(getDefaultContext(), getDefaultDevice());
+    sycl::free(ptr, *queue);
+    delete queue;
+  } else {
+    sycl::free(ptr, *queue);
+  }
 }
 
 static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index ae8b7aaac7fd9..74661a0b89b1d 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -29,12 +29,10 @@ module attributes {gpu.container_module} {
     // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
     // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[nullptr]], %[[isHostShared]])
     %0 = gpu.alloc host_shared (%size) : memref<?xf32>
-    // CHECK: %[[stream:.*]] = llvm.call @mgpuStreamCreate()
-    %1 = gpu.wait async
-    %2 = gpu.dealloc async [%1] %0 : memref<?xf32>
-    // CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]])
-    // CHECK: llvm.call @mgpuStreamDestroy(%[[stream]])
-    gpu.wait [%2]
+    // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
+    // CHECK: %[[stream:.*]] = llvm.mlir.zero : !llvm.ptr
+    // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
+    gpu.dealloc %0 : memref<?xf32>
     return
   }
 }
diff --git a/mlir/test/Dialect/GPU/async-region.mlir b/mlir/test/Dialect/GPU/async-region.mlir
index 00832c45de114..d68d4d27c9792 100644
--- a/mlir/test/Dialect/GPU/async-region.mlir
+++ b/mlir/test/Dialect/GPU/async-region.mlir
@@ -189,4 +189,13 @@ module attributes {gpu.container_module} {
     gpu.wait
     return
   }
+
+  // CHECK-LABEL:func @alloc_host_shared()
+  func.func @alloc_host_shared() {
+    // CHECK: %[[m:.*]] = gpu.alloc host_shared () : memref<7xf32>
+    %0 = gpu.alloc host_shared() : memref<7xf32>
+    // CHECK: gpu.dealloc %[[m]] : memref<7xf32>
+    gpu.dealloc %0 : memref<7xf32>
+    return
+  }
 }
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
index c0e2903aee2d1..6d01ffba29c34 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -25,16 +25,12 @@ module @add attributes {gpu.container_module} {
     %memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
     memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
     %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
-    %2 = gpu.wait async
-    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
-    gpu.wait [%3]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
     %alloc = memref.alloc() : memref<2x2x2xf32>
     memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
-    %4 = gpu.wait async
-    %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
-    %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
-    %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
-    gpu.wait [%7]
+    gpu.dealloc %memref_2 : memref<2x2x2xf32>
+    gpu.dealloc %memref_0 : memref<2x2x2xf32>
+    gpu.dealloc %mem : memref<2x2x2xf32>
     return %alloc : memref<2x2x2xf32>
   }
   gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
index 4ac1533b75d20..b9c740971ccd2 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -25,16 +25,12 @@ module @add attributes {gpu.container_module} {
     %memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
     memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
     %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
-    %2 = gpu.wait async
-    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
-    gpu.wait [%3]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
     %alloc = memref.alloc() : memref<3x3xi64>
     memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
-    %4 = gpu.wait async
-    %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
-    %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
-    %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
-    gpu.wait [%7]
+    gpu.dealloc %memref_2 : memref<3x3xi64>
+    gpu.dealloc %memref_0 : memref<3x3xi64>
+    gpu.dealloc %mem : memref<3x3xi64>
     return %alloc : memref<3x3xi64>
   }
   gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
index 162a793305e97..770980f810c68 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -40,19 +40,14 @@ module @relu attributes {gpu.container_module} {
     %memref = gpu.alloc host_shared () : memref<4x5xf32>
     memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
     %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
-    %2 = gpu.wait async
-    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
-    gpu.wait [%3]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
     %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
-    %4 = gpu.wait async
-    %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
-    gpu.wait [%5]
+    gpu.launch_func @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
     %alloc = memref.alloc() : memref<4x5xf32>
     memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
-    %6 = gpu.wait async
-    %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
-    %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
-    %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
+    gpu.dealloc %memref_1 : memref<4x5xf32>
+    gpu.dealloc %memref_0 : memref<4x5xi1>
+    gpu.dealloc %memref : memref<4x5xf32>
     return %alloc : memref<4x5xf32>
   }
   gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
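
Note on the SyclRuntimeWrappers change above: freeing SYCL unified shared memory only requires a queue bound to the same context the memory was allocated against, not the original queue, which is why deallocDeviceMemory can build a temporary queue from the default context/device when the lowered gpu.dealloc hands it a null stream (the llvm.mlir.zero value in the CHECK lines). Below is a minimal standalone sketch of that behavior, assuming a SYCL 2020 toolchain; it is illustrative only and not part of the patch.

    // Illustration (not part of the patch): USM shared memory behaves like
    // gpu.alloc host_shared, and can be freed through a different queue on
    // the same context -- the same property the null-queue path in
    // deallocDeviceMemory relies on.
    #include <sycl/sycl.hpp>

    int main() {
      sycl::queue q; // default device and context
      float *buf = sycl::malloc_shared<float>(16, q);

      for (int i = 0; i < 16; ++i)
        buf[i] = static_cast<float>(i); // host writes directly, no copy

      // Free via a temporary queue on the same context, mirroring the
      // temporary-queue path added to deallocDeviceMemory.
      sycl::queue tmp{q.get_context(), q.get_device()};
      sycl::free(buf, tmp);
      return 0;
    }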