Skip to content

Commit b145a5f

Browse files
Add CK Tile Tutorials Folder with GEMM and COPY Kernel (#3038)
* feat: add tutorial folder with gemm tutorial * chore: move copy kernel from examples folder to tutorial * Update tutorial/ck_tile/01_naive_gemm/README.md Co-authored-by: Copilot <[email protected]> * Update tutorial/ck_tile/01_naive_gemm/README.md Co-authored-by: Copilot <[email protected]> * chore: remove handdrawn images * docs: add write ups to explain the gemm kernel * docs: add about block level pipeline and static distributed tensors --------- Co-authored-by: Copilot <[email protected]>
1 parent c54ecd9 commit b145a5f

24 files changed

+3288
-16
lines changed

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
683683
PACKAGE_NAME examples
684684
)
685685
add_subdirectory(example)
686+
687+
add_subdirectory(tutorial)
688+
rocm_package_setup_component(tutorials
689+
LIBRARY_NAME composablekernel
690+
PACKAGE_NAME tutorials
691+
)
686692
add_subdirectory(tile_engine)
687693
if(BUILD_TESTING)
688694
add_subdirectory(test)

example/ck_tile/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ add_subdirectory(22_gemm_multi_abd)
2525
add_subdirectory(35_batched_transpose)
2626
add_subdirectory(36_pooling)
2727
add_subdirectory(38_block_scale_gemm)
28-
add_subdirectory(39_copy)
2928
add_subdirectory(40_streamk_gemm)
3029
add_subdirectory(41_batched_contraction)
3130

tutorial/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
include_directories(BEFORE
2+
${PROJECT_SOURCE_DIR}/include
3+
${PROJECT_SOURCE_DIR}/library/include
4+
)
5+
6+
message(STATUS "Building tutorials...")
7+
add_custom_target(tutorials)
8+
9+
# add every tutorial subdirectory that contains a CMakeLists.txt
10+
file(GLOB dir_list LIST_DIRECTORIES true *)
11+
FOREACH(subdir ${dir_list})
12+
if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt")
13+
add_subdirectory(${subdir})
14+
ENDIF()
15+
ENDFOREACH()
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
add_executable(tile_example_copy EXCLUDE_FROM_ALL copy_basic.cpp)
1+
add_executable(tile_tutorial_copy_kernel EXCLUDE_FROM_ALL copy_basic.cpp)
22

33
# Impact: This flag ensures that the compiler doesn't make
44
# assumptions about memory aliasing that could interfere with Composable Kernel's explicit memory access patterns.
5-
target_compile_options(tile_example_copy PRIVATE
5+
target_compile_options(tile_tutorial_copy_kernel PRIVATE
66
-mllvm -enable-noalias-to-md-conversion=0
77
)
8+
9+
add_dependencies(tutorials tile_tutorial_copy_kernel)
File renamed without changes.

example/ck_tile/39_copy/copy_basic.cpp renamed to tutorial/ck_tile/00_copy_kernel/copy_basic.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
5454
x_buf.ToDevice(x_host.data());
5555

5656
// Define tile configuration
57-
using ThreadTile = ck_tile::sequence<1, 4>; // per-thread tile size along M and N
58-
using WaveTile = ck_tile::sequence<64, 4>; // wave size along M and N dimension
59-
using BlockWaves = ck_tile::sequence<4, 1>; // number of waves along M dimension
60-
using BlockTile = ck_tile::sequence<512, 4>; // block size along M and N dimension
57+
using ThreadTile = ck_tile::sequence<1, 4>; // per-thread tile size along M and N
58+
using WaveTile = ck_tile::sequence<64, 4>; // per-wave tile size along M and N dimension
59+
using BlockWaves = ck_tile::sequence<4, 1>; // number of waves per block along M and N dimension
60+
using BlockTile = ck_tile::sequence<512, 4>; // per-block tile size along M and N dimension
6161

6262
// Calculate grid size
6363
ck_tile::index_t kGridSize =
@@ -68,14 +68,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
6868
using Shape = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, ThreadTile>;
6969
using Problem = ck_tile::TileCopyProblem<XDataType, Shape>;
7070
using Policy = ck_tile::TileCopyPolicy<Problem>;
71-
using Kernel = ck_tile::ElementWiseTileCopyKernel<Problem, Policy>;
72-
// using Kernel = ck_tile::TileCopyKernel<Problem, Policy>;
73-
// using Kernel = ck_tile::TileCopyKernel_LDS<Problem, Policy>;
74-
75-
// question: Why do we not have a pipeline?
76-
// answer: For basic copy operation, pipeline is not needed.
77-
// we intentionally do not use pipeline for this example and let the kernel be composite of
78-
// Problem and Policy
71+
using Kernel = ck_tile::ElementWiseTileCopyKernel<Problem, Policy>; // operates on an
72+
// element-by-element basis.
73+
74+
// We also implement two variations of the copy kernel:
75+
// 1. TileCopyKernel: This is the basic copy kernel that operates on a tile-by-tile basis.
76+
// 2. TileCopyKernel_LDS: This is the copy kernel that operates on a tile-by-tile basis and uses
77+
// the LDS. To use either variation, replace the Kernel alias above with
78+
// ck_tile::TileCopyKernel<Problem, Policy> or ck_tile::TileCopyKernel_LDS<Problem, Policy>.
7979

8080
auto blockSize = Kernel::BlockSize();
8181

example/ck_tile/39_copy/test_tile_example.sh renamed to tutorial/ck_tile/00_copy_kernel/test_tile_example.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
set -euo pipefail
66

7-
BIN="${BIN:-../../../build/bin/tile_example_copy}"
7+
BIN="${BIN:-../../../build/bin/tile_tutorial_copy_kernel}"
88
WARMUP="${WARMUP:-20}"
99
REPEAT="${REPEAT:-100}"
1010
VALIDATE="${VALIDATE:-1}"

0 commit comments

Comments
 (0)