Support cooperative launch #676

Merged · 6 commits · Jun 6, 2025
11 changes: 11 additions & 0 deletions cuda_core/cuda/core/experimental/_device.py
@@ -701,6 +701,17 @@ def can_use_host_pointer_for_registered_mem(self) -> bool:
)
)

# TODO: A few attrs are missing here (NVIDIA/cuda-python#675)

@property
def cooperative_launch(self) -> bool:
"""
True if device supports launching cooperative kernels, False if not.
"""
return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH))

# TODO: A few attrs are missing here (NVIDIA/cuda-python#675)

@property
def max_shared_memory_per_block_optin(self) -> int:
"""
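The new attribute is queried like any other entry on `Device.properties`. A minimal, hedged sketch of user-side usage (assumes a CUDA-capable GPU and the experimental API as of this PR):

```python
# Sketch: check the new device attribute before attempting a cooperative launch.
from cuda.core.experimental import Device

dev = Device()
dev.set_current()
if not dev.properties.cooperative_launch:
    print("This device cannot launch cooperative kernels")
```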
11 changes: 11 additions & 0 deletions cuda_core/cuda/core/experimental/_launch_config.py
@@ -58,11 +58,15 @@ class LaunchConfig:
cluster: Union[tuple, int] = None
block: Union[tuple, int] = None
shmem_size: Optional[int] = None
cooperative_launch: Optional[bool] = False

def __post_init__(self):
_lazy_init()
self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
# FIXME: Calling Device() here is not quite right; we should look up the
# device from the stream instead, and likely defer checks that depend on
# device compute capability or attributes.
# Thread block clusters are supported on Hopper (compute capability 9.0) and newer.
if self.cluster is not None:
if not _use_ex:
@@ -77,6 +81,8 @@ def __post_init__(self):
self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
if self.shmem_size is None:
self.shmem_size = 0
if self.cooperative_launch and not Device().properties.cooperative_launch:
raise CUDAError("cooperative kernels are not supported on this device")


def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
@@ -92,6 +98,11 @@ def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
dim = attr.value.clusterDim
dim.x, dim.y, dim.z = config.cluster
attrs.append(attr)
if config.cooperative_launch:
attr = driver.CUlaunchAttribute()
attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
attr.value.cooperative = 1
attrs.append(attr)
drv_cfg.numAttrs = len(attrs)
drv_cfg.attrs = attrs
return drv_cfg
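With the new dataclass field and launch attribute, users opt in per launch. A small sketch of constructing such a config (hypothetical grid/block sizes; per the `__post_init__` check above, construction raises `CUDAError` if the current device lacks cooperative-launch support):

```python
# Sketch: opt in to a cooperative launch via the new LaunchConfig field.
from cuda.core.experimental import Device, LaunchConfig

Device().set_current()  # the __post_init__ check queries the current device
config = LaunchConfig(grid=4, block=128, cooperative_launch=True)
```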
16 changes: 16 additions & 0 deletions cuda_core/cuda/core/experimental/_launcher.py
@@ -9,6 +9,7 @@
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
_reduce_3_tuple,
check_or_create_options,
driver,
get_binding_version,
@@ -78,6 +79,8 @@ def launch(stream, config, kernel, *kernel_args):
if _use_ex:
drv_cfg = _to_native_launch_config(config)
drv_cfg.hStream = stream.handle
if config.cooperative_launch:
_check_cooperative_launch(kernel, config, stream)
handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
else:
# TODO: check if config has any unsupported attrs
@@ -86,3 +89,16 @@
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
)
)


def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
dev = stream.device
num_sm = dev.properties.multiprocessor_count
max_grid_size = (
kernel.occupancy.max_active_blocks_per_multiprocessor(_reduce_3_tuple(config.block), config.shmem_size) * num_sm
)
if _reduce_3_tuple(config.grid) > max_grid_size:
# For now, do not silently adjust the grid size behind the user's back;
# instead, ask the user to adjust it explicitly.
x, y, z = config.grid
raise ValueError(f"The specified grid size ({x} * {y} * {z}) exceeds the limit ({max_grid_size})")
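The check above encodes the CUDA rule that all blocks of a cooperative kernel must be co-resident on the GPU, so the grid is capped at `max_active_blocks_per_multiprocessor * multiprocessor_count`. A hedged sketch of computing that cap by hand with the same occupancy API the helper uses (`dev` and `ker` are assumed to be an existing `Device` and compiled `Kernel`):

```python
# Sketch: the largest grid a cooperative kernel may use on this device.
block = 128       # threads per block (flattened)
shmem_size = 0    # dynamic shared memory per block, in bytes
num_sm = dev.properties.multiprocessor_count
max_blocks_per_sm = ker.occupancy.max_active_blocks_per_multiprocessor(block, shmem_size)
max_grid = max_blocks_per_sm * num_sm  # launch() raises ValueError beyond this
```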
4 changes: 4 additions & 0 deletions cuda_core/cuda/core/experimental/_utils/cuda_utils.py
@@ -48,6 +48,10 @@ def cast_to_3_tuple(label, cfg):
return cfg + (1,) * (3 - len(cfg))


def _reduce_3_tuple(t: tuple):
    # Product of the three extents, e.g. the total number of blocks in a 3D grid.
    return t[0] * t[1] * t[2]


def _check_driver_error(error):
if error == driver.CUresult.CUDA_SUCCESS:
return
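For illustration, a hedged example (hypothetical values) of how the two small helpers compose when normalizing a grid spec:

```python
# Sketch: cast_to_3_tuple normalizes a grid spec; _reduce_3_tuple flattens it.
grid = cast_to_3_tuple("LaunchConfig.grid", (4, 2))  # -> (4, 2, 1)
total_blocks = _reduce_3_tuple(grid)                 # -> 8
```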
3 changes: 2 additions & 1 deletion cuda_core/docs/source/release/0.3.0-notes.rst
@@ -22,6 +22,7 @@ New features

- :class:`Kernel` adds :property:`Kernel.num_arguments` and :property:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
- Add pythonic access to kernel occupancy calculation functions via :property:`Kernel.occupancy`. (#648)
- Support launching cooperative kernels by setting :property:`LaunchConfig.cooperative_launch` to ``True``. (#676)

New examples
------------
@@ -31,4 +32,4 @@ Fixes and enhancements
----------------------

- An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed
- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed.
4 changes: 4 additions & 0 deletions cuda_core/tests/conftest.py
@@ -69,3 +69,7 @@ def pop_all_contexts():
os.environ.get("CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER", "0") == "1",
reason="The compute-sanitizer is running, and this test causes an API error.",
)


# TODO: make the fixture more sophisticated using path finder
skipif_need_cuda_headers = pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need CUDA header")
1 change: 1 addition & 0 deletions cuda_core/tests/test_device.py
@@ -191,6 +191,7 @@ def test_compute_capability():
("concurrent_managed_access", bool),
("compute_preemption_supported", bool),
("can_use_host_pointer_for_registered_mem", bool),
("cooperative_launch", bool),
("max_shared_memory_per_block_optin", int),
("pageable_memory_access_uses_host_page_tables", bool),
("direct_managed_mem_access_from_host", bool),
5 changes: 2 additions & 3 deletions cuda_core/tests/test_event.py
@@ -7,7 +7,7 @@

import numpy as np
import pytest
from conftest import skipif_testing_with_compute_sanitizer
from conftest import skipif_need_cuda_headers, skipif_testing_with_compute_sanitizer

import cuda.core.experimental
from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
@@ -114,9 +114,8 @@ def test_error_timing_recorded():
event3 - event2


# TODO: improve this once path finder can find headers
@skipif_testing_with_compute_sanitizer
@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
@skipif_need_cuda_headers # libcu++
@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
def test_error_timing_incomplete():
device = Device()
46 changes: 46 additions & 0 deletions cuda_core/tests/test_launcher.py
@@ -7,6 +7,7 @@

import numpy as np
import pytest
from conftest import skipif_need_cuda_headers

from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
from cuda.core.experimental._memory import _DefaultPinnedMemorySource
@@ -152,3 +153,48 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value):

# Check result
assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}"


@skipif_need_cuda_headers # cg
def test_cooperative_launch():
dev = Device()
dev.set_current()
s = dev.create_stream(options={"nonblocking": True})

# CUDA kernel performing a grid-wide sync via cooperative groups
code = r"""
#include <cooperative_groups.h>

extern "C" __global__ void test_grid_sync() {
namespace cg = cooperative_groups;
auto grid = cg::this_grid();
grid.sync();
}
"""

# Compile for the current device's compute capability
arch = "".join(f"{i}" for i in dev.compute_capability)
include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=include_path)
prog = Program(code, code_type="c++", options=pro_opts)
ker = prog.compile("cubin").get_kernel("test_grid_sync")

# # Launch without setting cooperative_launch
# # Commented out as this seems to be a sticky error...
# config = LaunchConfig(grid=1, block=1)
# launch(s, config, ker)
# from cuda.core.experimental._utils.cuda_utils import CUDAError
# with pytest.raises(CUDAError) as e:
# s.sync()
# assert "CUDA_ERROR_LAUNCH_FAILED" in str(e)

# A grid exceeding the co-residency limit must be rejected
block = 128
config = LaunchConfig(grid=dev.properties.max_grid_dim_x // block + 1, block=block, cooperative_launch=True)
with pytest.raises(ValueError):
launch(s, config, ker)

# This works just fine
config = LaunchConfig(grid=1, block=1, cooperative_launch=True)
launch(s, config, ker)
s.sync()