Commit 19c4169

Feature/occupancy (#648)
* Fix typo: stream._handle -> stream.handle

  The Stream class does not have a _handle data member.

* Move the definition of the LaunchConfig class to a separate file

  This is necessary to avoid a circular dependency. Cluster-related occupancy functions need LaunchConfig;
  occupancy functions are defined in _module.py, and _launcher.py, which used to house the definition of
  LaunchConfig, imports Kernel from _module.py.

* Introduce _module.KernelOccupancy class

  This class defines kernel occupancy query methods (a rough usage sketch follows the commit metadata below):

  - max_active_blocks_per_multiprocessor
  - max_potential_block_size
  - available_dynamic_shared_memory_per_block
  - max_potential_cluster_size
  - max_active_clusters

  The implementation is based on the driver API. The following occupancy-related driver functions are not used:

  - `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`
  - `cuOccupancyMaxPotentialBlockSizeWithFlags`

  In `cuOccupancyMaxPotentialBlockSize`, only a constant dynamic shared-memory size is supported for now.
  Supporting a variable dynamic shared-memory size that depends on the block size is deferred until the
  design is resolved.

* Add occupancy tests, except for cluster-related queries

* Fix type in querying handle from Stream argument

* Add tests for cluster-related occupancy descriptors

* Introduce MaxPotentialBlockSizeOccupancyResult named tuple

  Use it as the return type for the KernelOccupancy.max_potential_block_size output.

* KernelOccupancy.max_potential_block_size support for CUoccupancyB2DSize

  The cuda_utils.driver.CUoccupancyB2DSize type is supported. The required size of the dynamic shared-memory
  allocation is renamed to dynamic_shared_memory_needed.

* Add test for B2DSize usage in max_potential_block_size

  The test requires Numba. If Numba is absent, the test is skipped; otherwise `numba.cfunc` is used to
  compile a Python function. The ctypes.CFuncPtr object obtained from cfunc_res.ctypes is converted to
  CUoccupancyB2DSize.

* Improve max_potential_block_size.__doc__

  Expanded the docstring and added an advisory about the possibility of deadlocks should the function
  encoded in CUoccupancyB2DSize require the GIL. Added argument type validation for the
  dynamic_shared_memory_needed argument.

* Add test for dynamic_shared_memory_needed arg of invalid type

* Mention feature/occupancy in 0.3.0 release notes

* Add symbols to api_private.rst

* Reduce test name verbosity

  Occupancy tests need not contain "saxpy" in the test name even though they use the saxpy kernel for testing.

* Add doc-strings to KernelOccupancy methods

* Fix rendering
1 parent 064b9ea commit 19c4169
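
Below is a rough usage sketch of the occupancy queries this commit message describes. Only the method and named-tuple names come from the message above; the `.occupancy` accessor, argument order, and printed fields are assumptions, not the merged API, and a working CUDA driver/toolchain is assumed.

# Hypothetical sketch; names marked "assumed" are not taken from the diff shown on this page.
from cuda.core.experimental import Device, Program

dev = Device()
dev.set_current()
prog = Program('extern "C" __global__ void noop() {}', code_type="c++")
kernel = prog.compile("cubin").get_kernel("noop")

occ = kernel.occupancy                                    # assumed accessor for the new KernelOccupancy object
print(occ.max_active_blocks_per_multiprocessor(256, 0))   # assumed args: block size, dynamic shmem bytes
print(occ.max_potential_block_size(0))                    # MaxPotentialBlockSizeOccupancyResult per the message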

File tree

7 files changed: +407 −70 lines


cuda_core/cuda/core/experimental/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -5,7 +5,8 @@
 from cuda.core.experimental import utils
 from cuda.core.experimental._device import Device
 from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._launcher import LaunchConfig, launch
+from cuda.core.experimental._launch_config import LaunchConfig
+from cuda.core.experimental._launcher import launch
 from cuda.core.experimental._linker import Linker, LinkerOptions
 from cuda.core.experimental._module import ObjectCode
 from cuda.core.experimental._program import Program, ProgramOptions
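
The public import path is unchanged by this refactor; user code keeps importing from the package root. A quick sanity check, assuming cuda.core and a CUDA driver are installed:

from cuda.core.experimental import LaunchConfig, launch  # same names, now re-exported from _launch_config / _launcher

cfg = LaunchConfig(grid=4, block=128)  # ints are cast to 3-tuples in __post_init__
print(cfg.grid, cfg.block)             # -> (4, 1, 1) (128, 1, 1)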
cuda_core/cuda/core/experimental/_launch_config.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+from cuda.core.experimental._device import Device
+from cuda.core.experimental._utils.cuda_utils import (
+    CUDAError,
+    cast_to_3_tuple,
+    driver,
+    get_binding_version,
+    handle_return,
+)
+
+# TODO: revisit this treatment for py313t builds
+_inited = False
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _use_ex
+    # binding availability depends on cuda-python version
+    _py_major_minor = get_binding_version()
+    _driver_ver = handle_return(driver.cuDriverGetVersion())
+    _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
+    _inited = True
+
+
+@dataclass
+class LaunchConfig:
+    """Customizable launch options.
+
+    Attributes
+    ----------
+    grid : Union[tuple, int]
+        Collection of threads that will execute a kernel function.
+    cluster : Union[tuple, int]
+        Group of blocks (Thread Block Cluster) that will execute on the same
+        GPU Processing Cluster (GPC). Blocks within a cluster have access to
+        distributed shared memory and can be explicitly synchronized.
+    block : Union[tuple, int]
+        Group of threads (Thread Block) that will execute on the same
+        streaming multiprocessor (SM). Threads within a thread blocks have
+        access to shared memory and can be explicitly synchronized.
+    shmem_size : int, optional
+        Dynamic shared-memory size per thread block in bytes.
+        (Default to size 0)
+
+    """
+
+    # TODO: expand LaunchConfig to include other attributes
+    grid: Union[tuple, int] = None
+    cluster: Union[tuple, int] = None
+    block: Union[tuple, int] = None
+    shmem_size: Optional[int] = None
+
+    def __post_init__(self):
+        _lazy_init()
+        self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
+        self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
+        # thread block clusters are supported starting H100
+        if self.cluster is not None:
+            if not _use_ex:
+                err, drvers = driver.cuDriverGetVersion()
+                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
+                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
+            cc = Device().compute_capability
+            if cc < (9, 0):
+                raise CUDAError(
+                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
+                )
+            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
+        if self.shmem_size is None:
+            self.shmem_size = 0
+
+
+def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
+    _lazy_init()
+    drv_cfg = driver.CUlaunchConfig()
+    drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
+    drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
+    drv_cfg.sharedMemBytes = config.shmem_size
+    attrs = []  # TODO: support more attributes
+    if config.cluster:
+        attr = driver.CUlaunchAttribute()
+        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+        dim = attr.value.clusterDim
+        dim.x, dim.y, dim.z = config.cluster
+        attrs.append(attr)
+    drv_cfg.numAttrs = len(attrs)
+    drv_cfg.attrs = attrs
+    return drv_cfg
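
A minimal sketch of what the moved module does at launch-prep time, assuming a CUDA driver and a current device are available (the cluster path additionally requires driver/bindings 11.8+ and compute capability 9.0+):

from cuda.core.experimental import Device, LaunchConfig
from cuda.core.experimental._launch_config import _to_native_launch_config  # private helper shown above

Device().set_current()                       # the cluster check in __post_init__ reads the current device's compute capability
cfg = LaunchConfig(grid=(8, 1, 1), block=256)
print(cfg.grid, cfg.block, cfg.shmem_size)   # -> (8, 1, 1) (256, 1, 1) 0

drv_cfg = _to_native_launch_config(cfg)      # CUlaunchConfig with grid/block dims and sharedMemBytes filled in
print(drv_cfg.gridDimX, drv_cfg.blockDimX, drv_cfg.numAttrs)   # -> 8 256 0 (no cluster attribute requested)

# cluster=(2, 1, 1) would also attach a CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute;
# on older drivers or pre-Hopper devices, __post_init__ raises CUDAError instead.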

cuda_core/cuda/core/experimental/_launcher.py

Lines changed: 4 additions & 67 deletions
@@ -2,17 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from dataclasses import dataclass
-from typing import Optional, Union
 
-from cuda.core.experimental._device import Device
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
+from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config
 from cuda.core.experimental._module import Kernel
 from cuda.core.experimental._stream import Stream
 from cuda.core.experimental._utils.clear_error_support import assert_type
 from cuda.core.experimental._utils.cuda_utils import (
-    CUDAError,
-    cast_to_3_tuple,
     check_or_create_options,
     driver,
     get_binding_version,
@@ -37,54 +33,6 @@ def _lazy_init():
     _inited = True
 
 
-@dataclass
-class LaunchConfig:
-    """Customizable launch options.
-
-    Attributes
-    ----------
-    grid : Union[tuple, int]
-        Collection of threads that will execute a kernel function.
-    cluster : Union[tuple, int]
-        Group of blocks (Thread Block Cluster) that will execute on the same
-        GPU Processing Cluster (GPC). Blocks within a cluster have access to
-        distributed shared memory and can be explicitly synchronized.
-    block : Union[tuple, int]
-        Group of threads (Thread Block) that will execute on the same
-        streaming multiprocessor (SM). Threads within a thread blocks have
-        access to shared memory and can be explicitly synchronized.
-    shmem_size : int, optional
-        Dynamic shared-memory size per thread block in bytes.
-        (Default to size 0)
-
-    """
-
-    # TODO: expand LaunchConfig to include other attributes
-    grid: Union[tuple, int] = None
-    cluster: Union[tuple, int] = None
-    block: Union[tuple, int] = None
-    shmem_size: Optional[int] = None
-
-    def __post_init__(self):
-        _lazy_init()
-        self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
-        self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
-        # thread block clusters are supported starting H100
-        if self.cluster is not None:
-            if not _use_ex:
-                err, drvers = driver.cuDriverGetVersion()
-                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
-                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
-            cc = Device().compute_capability
-            if cc < (9, 0):
-                raise CUDAError(
-                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
-                )
-            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
-        if self.shmem_size is None:
-            self.shmem_size = 0
-
-
 def launch(stream, config, kernel, *kernel_args):
     """Launches a :obj:`~_module.Kernel`
     object with launch-time configuration.
@@ -114,6 +62,7 @@ def launch(stream, config, kernel, *kernel_args):
                 f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
             ) from e
     assert_type(kernel, Kernel)
+    _lazy_init()
     config = check_or_create_options(LaunchConfig, config, "launch config")
 
     # TODO: can we ensure kernel_args is valid/safe to use here?
@@ -127,25 +76,13 @@ def launch(stream, config, kernel, *kernel_args):
    # mainly to see if the "Ex" API is available and if so we use it, as it's more feature
    # rich.
    if _use_ex:
-        drv_cfg = driver.CUlaunchConfig()
-        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
-        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
+        drv_cfg = _to_native_launch_config(config)
         drv_cfg.hStream = stream.handle
-        drv_cfg.sharedMemBytes = config.shmem_size
-        attrs = []  # TODO: support more attributes
-        if config.cluster:
-            attr = driver.CUlaunchAttribute()
-            attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
-            dim = attr.value.clusterDim
-            dim.x, dim.y, dim.z = config.cluster
-            attrs.append(attr)
-        drv_cfg.numAttrs = len(attrs)
-        drv_cfg.attrs = attrs
         handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
         # TODO: check if config has any unsupported attrs
         handle_return(
             driver.cuLaunchKernel(
-                int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream._handle, args_ptr, 0
+                int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
             )
         )
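
For reference, the call site is unchanged by this refactor: `launch` now builds the native CUlaunchConfig via `_to_native_launch_config` and only attaches the stream handle itself. A short end-to-end sketch (kernel source and names are illustrative; a working CUDA driver and toolchain are assumed):

from cuda.core.experimental import Device, LaunchConfig, Program, launch

dev = Device()
dev.set_current()
stream = dev.create_stream()

prog = Program('extern "C" __global__ void noop() {}', code_type="c++")
kernel = prog.compile("cubin").get_kernel("noop")

launch(stream, LaunchConfig(grid=1, block=1), kernel)   # internally uses _to_native_launch_config when the "Ex" API is available
stream.sync()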
