Only test trivial block size (paged_kv_block_size=256, has_batch_idx=False)

Dao-AILab · skrider · Feb 8, 2024 · Feb 9, 2024 · Feb 11, 2024 · Feb 11, 2024
commit 409431b812bf5ef85096232552735a8ae5b2f87c
diff --git a/csrc/cutlass b/csrc/cutlass
diff --git a/tests/test_flash_attn.py b/tests/test_flash_attn.py
@@ -1832,11 +1832,10 @@ def test_flash_attn_splitkv(
 # @pytest.mark.parametrize("rotary_interleaved", [False])
 @pytest.mark.parametrize("rotary_fraction", [0.0, 0.5, 1.0])
 # @pytest.mark.parametrize("rotary_fraction", [0.0])
-@pytest.mark.parametrize("paged_kv_block_size", [None, 256])
-# @pytest.mark.parametrize("paged_kv_block_size", [256, 512])
-# @pytest.mark.parametrize("paged_kv_block_size", [256])
-@pytest.mark.parametrize("has_batch_idx", [False, True])
-# @pytest.mark.parametrize("has_batch_idx", [False])
+# @pytest.mark.parametrize("paged_kv_block_size", [None, 256, 512])
+@pytest.mark.parametrize("paged_kv_block_size", [256])
+# @pytest.mark.parametrize("has_batch_idx", [False, True])
+@pytest.mark.parametrize("has_batch_idx", [False])
 @pytest.mark.parametrize("d", [32, 59, 64, 80, 128, 256])
 # @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192])
+2 −7		CHANGELOG.md
+3 −24		CMakeLists.txt
+0 −7		PUBLICATIONS.md
+4 −9		README.md
+38 −0		cmake/version.h.in
+0 −34		cmake/version_extended.h.in
+0 −1		examples/02_dump_reg_shmem/CMakeLists.txt
+2 −2		examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
+7 −7		examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
+8 −10		examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
+49 −96		examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu
+0 −10		examples/57_hopper_grouped_gemm/CMakeLists.txt
+1 −1		include/cute/arch/copy_sm90_desc.hpp
+0 −2		include/cute/atom/mma_atom.hpp
+2 −2		include/cute/util/print.hpp
+0 −3		include/cute/util/type_traits.hpp
+0 −4		include/cutlass/arch/mma_sm90.h
+0 −1		include/cutlass/bfloat16.h
+1 −35		include/cutlass/detail/layout.hpp
+7 −12		include/cutlass/epilogue/collective/builders/sm90_builder.inl
+0 −1		include/cutlass/epilogue/collective/default_epilogue.hpp
+18 −32		include/cutlass/epilogue/collective/default_epilogue_array.hpp
+38 −76		include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
+2 −1		include/cutlass/epilogue/dispatch_policy.hpp
+0 −28		include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
+0 −1		include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
+12 −57		include/cutlass/epilogue/thread/linear_combination.h
+183 −0		include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_row_broadcast.h
+519 −0		include/cutlass/epilogue/threadblock/predicated_tile_iterator_row_broadcast.h
+8 −4		include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
+29 −45		include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
+514 −0		include/cutlass/gemm/device/gemm_sparse_row_broadcast.h
+7 −4		include/cutlass/gemm/dispatch_policy.hpp
+0 −12		include/cutlass/gemm/group_array_problem_shape.hpp
+191 −0		include/cutlass/gemm/kernel/default_gemm_sparse_row_broadcast.h
+35 −30		include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
+7 −5		include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
+7 −5		include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
+7 −5		include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
+7 −5		include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
+7 −5		include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
+7 −5		include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
+7 −5		include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
+86 −140		include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
+400 −0		include/cutlass/gemm/kernel/sparse_gemm_row_broadcast.h
+6 −14		include/cutlass/gemm/kernel/tile_scheduler_params.h
+0 −80		include/cutlass/version.h
+2 −2		pyproject.toml
+3 −3		python/cutlass/__init__.py
+2 −6		python/cutlass/backend/c_types.py
+1 −23		python/cutlass/backend/epilogue.py
+2 −2		python/cutlass/backend/evt/frontend/frontend_base.py
+16 −0		python/cutlass/backend/evt/passes/graph_drawer.py
+18 −28		python/cutlass/backend/gemm_operation.py
+1 −1		python/setup_library.py
+1 −1		python/setup_pycute.py
+0 −1		test/unit/gemm/device/CMakeLists.txt
+19 −0		test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu
+0 −685		test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_store.cu
+20 −7		test/unit/gemm/device/testbed_sparse.h
+1 −1		tools/util/include/cutlass/util/packed_stride.hpp