Repro on a B200 machine:

```bash
$ CUDA_LAUNCH_BLOCKING=1 HELION_AUTOTUNE_RANDOM_SEED=3509511252 python benchmarks/run.py --op gemm --metrics speedup,accuracy --latency-measure-mode triton_do_bench --cudagraph --only helion --only-match-mode prefix-with-baseline --baseline aten_matmul --input-id 0 --num-inputs 1 --input-sample-mode first-k
```
Error:

```
Applying custom args for gemm: {}
Running gemm benchmark with Helion implementation...
First-k mode: Selected 1 sequential inputs starting from index 0 (total available: 31)
Input IDs to run: [0]
0%| | 0/1 [00:00<?, ?it/s]Removed 1 outliers from 10 samples
Input tensor metadata:
{ 'args': ( { 'device': 'cuda:0',
'dtype': 'torch.float16',
'shape': (256, 256),
'stride': (256, 1)},
{ 'device': 'cuda:0',
'dtype': 'torch.float16',
'shape': (256, 256),
'stride': (1, 256)},
None),
'kwargs': {}}
[0s] Autotune random seed: 3509511252
[0s] Starting autotuning process, this may take a while...
[0s] Starting PatternSearch with initial_population=100, copies=5, max_generations=20
0%| | 0/1 [00:10<?, ?it/s]
Caught exception on backend helion_matmul_tritonbench, terminating early with partial results
Traceback (most recent call last):
File "/home/willfeng/local/helion/helion/autotuner/base_search.py", line 213, in benchmark_function
output = fn(*self.args) # make sure the kernel is compiled
^^^^^^^^^^^^^^
File "/tmp/torchinductor_willfeng/si/csihsbr3gutkycsbksexjg3yj6yrqfjpyfuyfejxqflyd7noxxrj.py", line 56, in matmul
_launcher(_helion_matmul, (_NUM_SM,), x, y, out, _NUM_SM, _BLOCK_SIZE_1, _BLOCK_SIZE_0, _BLOCK_SIZE_2, num_warps=4, num_stages=6)
File "/home/willfeng/local/helion/helion/runtime/__init__.py", line 66, in default_launcher
return triton_kernel.run(
^^^^^^^^^^^^^^^^^^
File "/home/willfeng/local/pytorch-nightly/triton/runtime/jit.py", line 757, in run
kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
File "/home/willfeng/local/pytorch-nightly/triton/backends/nvidia/driver.py", line 712, in __call__
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,
RuntimeError: Triton Error [CUDA]: misaligned address
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/willfeng/local/helion/benchmarks/tritonbench/tritonbench/utils/triton_op.py", line 1115, in run
y_vals: Dict[str, BenchmarkOperatorMetrics] = functools.reduce(
^^^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/benchmarks/tritonbench/tritonbench/utils/triton_op.py", line 1098, in _reduce_benchmarks
acc[bm_name] = self._do_bench(
^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/benchmarks/tritonbench/tritonbench/utils/triton_op.py", line 1582, in _do_bench
metrics.latency = do_bench_wrapper(
^^^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/benchmarks/tritonbench/tritonbench/components/do_bench/run.py", line 492, in do_bench_wrapper
raise e
File "/home/willfeng/local/helion/benchmarks/tritonbench/tritonbench/components/do_bench/run.py", line 465, in do_bench_wrapper
times=bench_fn(
^^^^^^^^^
File "/home/willfeng/local/helion/benchmarks/tritonbench/tritonbench/components/do_bench/run.py", line 179, in _do_bench_cudagraph_with_cache_clear
fn()
File "/home/willfeng/local/helion/examples/matmul.py", line 149, in <lambda>
return lambda: matmul(a, b)
^^^^^^^^^^^^
File "/home/willfeng/local/helion/helion/runtime/kernel.py", line 292, in __call__
return self.bind(args)(*args)
^^^^^^^^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/helion/runtime/kernel.py", line 626, in __call__
self.autotune(args)
File "/home/willfeng/local/helion/helion/runtime/kernel.py", line 511, in autotune
config = self.settings.autotuner_fn(self, args, **kwargs).autotune()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/helion/autotuner/base_cache.py", line 178, in autotune
config = self.autotuner.autotune()
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/helion/autotuner/base_search.py", line 363, in autotune
best = self._autotune()
^^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/helion/autotuner/pattern_search.py", line 63, in _autotune
self.parallel_benchmark_population(self.population, desc="Initial population")
File "/home/willfeng/local/helion/helion/autotuner/base_search.py", line 517, in parallel_benchmark_population
self.parallel_benchmark([m.config for m in members], desc=desc),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/helion/autotuner/base_search.py", line 345, in parallel_benchmark
results.append((config, fn, self.benchmark_function(config, fn)))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/willfeng/local/helion/helion/autotuner/base_search.py", line 239, in benchmark_function
raise exc.TritonError(
helion.exc.TritonError: Error running generated Triton program:
@helion.kernel(config=helion.Config(block_sizes=[64, 16, 16], indexing='block_ptr', l2_groupings=[8], load_eviction_policies=['', 'last'], loop_orders=[[1, 0]], num_stages=6, num_warps=4,
pid_type='persistent_blocked', range_flattens=[False, None], range_multi_buffers=[True, None], range_num_stages=[4, 2], range_unroll_factors=[4, 4]), static_shapes=True)
RuntimeError: Triton Error [CUDA]: misaligned address
```

Generated Triton code:

```python
from __future__ import annotations

import torch
import helion
import triton
import triton.language as tl
from helion.runtime import default_launcher as _default_launcher


@triton.jit
def _helion_matmul(x, y, out, _NUM_SM: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
    total_pids = tl.cdiv(256, _BLOCK_SIZE_1) * tl.cdiv(256, _BLOCK_SIZE_0)
    block_size = tl.cdiv(total_pids, _NUM_SM)
    start_pid = tl.program_id(0) * block_size
    end_pid = tl.minimum(start_pid + block_size, total_pids)
    for virtual_pid in tl.range(start_pid, end_pid, loop_unroll_factor=4, num_stages=4, disallow_acc_multi_buffer=False, flatten=False):
        num_pid_m = tl.cdiv(256, _BLOCK_SIZE_1)
        num_pid_n = tl.cdiv(256, _BLOCK_SIZE_0)
        inner_2d_pid = virtual_pid
        num_pid_in_group = 8 * num_pid_n
        group_id = inner_2d_pid // num_pid_in_group
        first_pid_m = group_id * 8
        group_size_m = min(num_pid_m - first_pid_m, 8)
        pid_0 = first_pid_m + inner_2d_pid % num_pid_in_group % group_size_m
        pid_1 = inner_2d_pid % num_pid_in_group // group_size_m
        offset_1 = pid_0 * _BLOCK_SIZE_1
        offset_0 = pid_1 * _BLOCK_SIZE_0
        acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
        for offset_2 in tl.range(0, 256, _BLOCK_SIZE_2, loop_unroll_factor=4, num_stages=2):
            acc_copy = acc
            acc_copy_0 = acc_copy
            load = tl.load(tl.make_block_ptr(x, [256, 256], [256, 1], [offset_0, offset_2], [_BLOCK_SIZE_0, _BLOCK_SIZE_2], [1, 0]), boundary_check=[0, 1], padding_option='zero')
            load_1 = tl.load(tl.make_block_ptr(y, [256, 256], [1, 256], [offset_2, offset_1], [_BLOCK_SIZE_2, _BLOCK_SIZE_1], [0, 1]), boundary_check=[0, 1], padding_option='zero', eviction_policy='evict_last')
            acc = tl.dot(tl.cast(load, tl.float16), tl.cast(load_1, tl.float16), acc=acc_copy_0, input_precision='tf32', out_dtype=tl.float32)
        v_0 = tl.cast(acc, tl.float16)
        tl.store(tl.make_block_ptr(out, [256, 256], [256, 1], [offset_0, offset_1], [_BLOCK_SIZE_0, _BLOCK_SIZE_1], [1, 0]), v_0, boundary_check=[0, 1])


def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, tuple[Tensor, ...]], Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
    """
    Performs matrix multiplication of x and y with an optional epilogue function.

    Args:
        x (Tensor): Left matrix of shape [m, k].
        y (Tensor): Right matrix of shape [k, n].
        epilogue (Callable, optional): Function applied to the accumulator and tile indices
            after the matmul. Defaults to identity (no change).

    Returns:
        Tensor: Resulting matrix of shape [m, n].
    """
    m, k = x.size()
    k2, n = y.size()
    assert k == k2, f'size mismatch {k} != {k2}'
    out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
    _NUM_SM = helion.runtime.get_num_sm(x.device)
    _BLOCK_SIZE_1 = 16
    _BLOCK_SIZE_0 = 64
    _BLOCK_SIZE_2 = 16
    _launcher(_helion_matmul, (_NUM_SM,), x, y, out, _NUM_SM, _BLOCK_SIZE_1, _BLOCK_SIZE_0, _BLOCK_SIZE_2, num_warps=4, num_stages=6)
    return out
```
Failing input: --input-id 0 --num-inputs 1 --input-sample-mode first-k
```
(M, N, K)
-----------
[tritonbench] Output result csv to /tmp/tmpw0epc5yi.csv
Initial population  25% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25/100 18.4 configs/s
```
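
To iterate on this without the full tritonbench harness, something along these lines should hit the same launch failure. This is an unverified sketch: the `helion.Config` kwargs are copied from the error message above, the kernel body follows the `hl.tile` matmul pattern from the Helion README/examples (without the epilogue argument), and the transposed `y` matches the `(1, 256)` stride from the input metadata:

```python
# Sketch of a standalone repro; assumes the failing config can be pinned
# directly via @helion.kernel(config=...), as the error message suggests.
import torch
import helion
import helion.language as hl


@helion.kernel(
    config=helion.Config(
        block_sizes=[64, 16, 16],
        indexing="block_ptr",
        l2_groupings=[8],
        load_eviction_policies=["", "last"],
        loop_orders=[[1, 0]],
        num_stages=6,
        num_warps=4,
        pid_type="persistent_blocked",
        range_flattens=[False, None],
        range_multi_buffers=[True, None],
        range_num_stages=[4, 2],
        range_unroll_factors=[4, 4],
    ),
    static_shapes=True,
)
def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    m, k = x.size()
    k2, n = y.size()
    assert k == k2, f"size mismatch {k} != {k2}"
    out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
    for tile_m, tile_n in hl.tile([m, n]):
        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
        for tile_k in hl.tile(k):
            acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
        out[tile_m, tile_n] = acc
    return out


# Inputs matching the metadata above: fp16, 256x256, second operand transposed.
x = torch.randn(256, 256, device="cuda", dtype=torch.float16)       # stride (256, 1)
y = torch.randn(256, 256, device="cuda", dtype=torch.float16).t()   # stride (1, 256)
print(matmul(x, y))
```

If that reproduces, the config can then be bisected field by field (e.g. dropping `indexing='block_ptr'` or `pid_type='persistent_blocked'`) to isolate the trigger.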
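Separately, since the crash is a CUDA misaligned-address error and the config uses `indexing='block_ptr'`, it may be worth ruling out the alignment assumptions Triton's block-pointer path makes about base pointers (commonly 16-byte alignment for vectorized access). A quick plain-PyTorch check, with a hypothetical helper name:

```python
import torch


def report_alignment(name: str, t: torch.Tensor, width: int = 16) -> None:
    """Print the data pointer and whether it meets a given byte alignment.

    Hypothetical debugging helper: Triton's block-pointer path generally
    assumes the base pointer is aligned to the vectorized access width.
    """
    addr = t.data_ptr()
    stride_bytes = [s * t.element_size() for s in t.stride()]
    print(f"{name}: ptr=0x{addr:x} {width}B-aligned={addr % width == 0} "
          f"stride_bytes={stride_bytes}")


x = torch.randn(256, 256, device="cuda", dtype=torch.float16)
report_alignment("x", x)
report_alignment("y", x.t())         # transposed view shares x's base pointer
report_alignment("slice", x[:, 1:])  # 2-byte-offset view; no longer 16B-aligned
```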