Skip to content

Commit a33d98f

Browse files
[CK TILE ENGINE] GEMM Multi D Restructure (#3121)
* Renaming old code * Adding GEMM code with new Architecture * Partial Progress : Errors * Partial Progress : Working code * Changes to element wise function * Removing Debugging statements * Working GEMM Multi D code * Removing Stale Code * Address Copilot review comments * Address Copilot review comments * Changes to validation file * Changes to common code snippets * Creating common folder * Removing duplicate files * Pointing to right common file * Pointing to right common file * Pointing to right common file * Changing to VERBOSE * Changing CMAKE messages to verbose * Updating Cmake with right layout datatype configs * Working code for GEMM Multi D
1 parent 04efd28 commit a33d98f

22 files changed

+2415
-1974
lines changed

Jenkinsfile

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1642,14 +1642,9 @@ pipeline {
16421642
ninja -j64 benchmark_gemm_preshuffle_all && \
16431643
python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \
16441644
--warmup 5 --repeat 5 --verbose --json results.json && \
1645-
ninja -j64 benchmark_gemm_multi_d_fp16_rrrr && \
1646-
./bin/benchmark_gemm_multi_d_fp16_rrrr && \
1647-
ninja -j64 benchmark_gemm_multi_d_fp16_ccrr && \
1648-
./bin/benchmark_gemm_multi_d_fp16_ccrr && \
1649-
ninja -j64 benchmark_gemm_multi_d_fp16_crrr && \
1650-
./bin/benchmark_gemm_multi_d_fp16_crrr && \
1651-
ninja -j64 benchmark_gemm_multi_d_fp16_rcrr && \
1652-
./bin/benchmark_gemm_multi_d_fp16_rcrr """
1645+
ninja -j64 benchmark_gemm_multi_d_all && \
1646+
python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" \
1647+
--warmup 5 --repeat 5 --verbose --json results.json """
16531648
}
16541649
steps{
16551650
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -1682,14 +1677,9 @@ pipeline {
16821677
ninja -j64 benchmark_gemm_preshuffle_all && \
16831678
python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \
16841679
--warmup 5 --repeat 5 --verbose --json results.json && \
1685-
ninja -j64 benchmark_gemm_multi_d_fp16_rrrr && \
1686-
./bin/benchmark_gemm_multi_d_fp16_rrrr && \
1687-
ninja -j64 benchmark_gemm_multi_d_fp16_ccrr && \
1688-
./bin/benchmark_gemm_multi_d_fp16_ccrr && \
1689-
ninja -j64 benchmark_gemm_multi_d_fp16_crrr && \
1690-
./bin/benchmark_gemm_multi_d_fp16_crrr && \
1691-
ninja -j64 benchmark_gemm_multi_d_fp16_rcrr && \
1692-
./bin/benchmark_gemm_multi_d_fp16_rcrr """
1680+
ninja -j64 benchmark_gemm_multi_d_all && \
1681+
python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" \
1682+
--warmup 5 --repeat 5 --verbose --json results.json """
16931683
}
16941684
steps{
16951685
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)

tile_engine/ops/gemm/commons/validation_utils.py renamed to tile_engine/ops/commons/validation_utils.py

Lines changed: 23 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -125,38 +125,13 @@
125125
[32, 32, 64],
126126
],
127127
},
128-
"gfx1201": {
128+
"gfx1201": { # Check how to handle for GEMM and Multi D
129129
"fp16_fp16_fp16": [
130130
[16, 16, 16],
131131
],
132132
},
133133
}
134134

135-
# Supported warp tile combinations for different GPU architectures and data types
136-
WARP_SUPPORTED_COMBINATIONS = {
137-
"gfx90a": [
138-
[1, 4, 1],
139-
[2, 2, 1],
140-
[4, 1, 1],
141-
],
142-
"gfx942": [
143-
[1, 4, 1],
144-
[2, 2, 1],
145-
[4, 1, 1],
146-
],
147-
"gfx950": [
148-
[1, 4, 1],
149-
[2, 2, 1],
150-
[4, 1, 1],
151-
],
152-
"gfx1201": [
153-
[2, 4, 1],
154-
[1, 8, 1],
155-
[8, 1, 1],
156-
[4, 2, 1],
157-
],
158-
}
159-
160135
# Unsupported trait combinations
161136
TRAIT_UNSUPPORTED_COMBINATIONS = {
162137
("compv3", "cshuffle", "interwave"),
@@ -441,6 +416,20 @@ def get_abc_layouts(layout_code: str) -> Tuple[str, str, str]:
441416
return a_layout, b_layout, c_layout
442417

443418

419+
def get_abcd_layouts(layout_code: str) -> Tuple[str, str, str, List[str]]:
420+
"""
421+
Return (ALayout, BLayout, CLayout, [D0Layout, D1Layout]) from a 4-letter code like 'rcrr', 'ccrr', 'crrr', 'rrrr'.
422+
"""
423+
code = str(layout_code).strip().lower()
424+
425+
a_layout = LAYOUT_MAP[code[0]]
426+
b_layout = LAYOUT_MAP[code[1]]
427+
c_layout = LAYOUT_MAP[code[2]]
428+
d0_layout = LAYOUT_MAP[code[3]]
429+
d1_layout = LAYOUT_MAP[code[3]]
430+
return a_layout, b_layout, c_layout, [d0_layout, d1_layout]
431+
432+
444433
def validate_whole_wg_cover_configuration(
445434
tile_m,
446435
tile_n,
@@ -464,13 +453,13 @@ def validate_whole_wg_cover_configuration(
464453

465454
# A matrix validation
466455
if layout[0] == "r":
467-
XPerTile = tile_k
468-
YPerTile = tile_m
469-
470456
vector_load_size = get_global_vector_load_size(
471457
BlockSize, tile_k, a_datatype, tile_m, tile_k
472458
)
473459

460+
XPerTile = tile_k
461+
YPerTile = tile_m
462+
474463
elif layout[0] == "c":
475464
vector_load_size = get_global_vector_load_size(
476465
BlockSize, tile_k, a_datatype, tile_m, tile_m
@@ -485,7 +474,6 @@ def validate_whole_wg_cover_configuration(
485474
)
486475

487476
if not wg_cover_core_valid:
488-
print("I am here 1")
489477
logging.debug(
490478
f"whole workgroup cover failed for Matrix A distribution: {wg_cover_core_error}"
491479
)
@@ -521,7 +509,7 @@ def validate_whole_wg_cover_configuration(
521509
if not wg_cover_core_valid:
522510
print("I am here 3")
523511
logging.debug(
524-
f"whole workgroup cover failed for Matrix A distribution: {wg_cover_core_error}"
512+
f"whole workgroup cover failed for Matrix B distribution: {wg_cover_core_error}"
525513
)
526514
return False, wg_cover_core_error
527515

@@ -540,7 +528,6 @@ def validate_whole_wg_cover_configuration(
540528
XPerTile, YPerTile, BlockSize, vector_load_size, warp_size
541529
)
542530
if not wg_cover_core_valid:
543-
print("I am here 4")
544531
logging.debug(
545532
f"whole workgroup cover failed for Matrix B: {wg_cover_core_error}"
546533
)
@@ -557,7 +544,7 @@ def wg_cover_core_validation(
557544
warp_size: int,
558545
) -> Tuple[bool, str]:
559546
if XPerTile % vector_load_size != 0:
560-
return False
547+
return False, "XPerTile is not divisible by vector_load_size"
561548

562549
num_warps = BlockSize / warp_size
563550
LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size)
@@ -567,7 +554,7 @@ def wg_cover_core_validation(
567554
Y1 = warp_size // X0
568555

569556
if X0 * Y1 != warp_size:
570-
return False, ""
557+
return False, "X0 * Y1 != warp_size"
571558

572559
return True, ""
573560

@@ -583,9 +570,9 @@ def get_global_vector_load_size(
583570
PackedSize = 1
584571

585572
if (
586-
XPerTile % (PackedSize * 32 / element_size(DataType)) == 0
573+
PackedSize == 2
574+
and XPerTile % (PackedSize * 32 / element_size(DataType)) == 0
587575
and elements_per_thread % (PackedSize * 32 / element_size(DataType)) == 0
588-
and PackedSize == 2
589576
):
590577
return PackedSize * 32 / element_size(DataType)
591578
elif (

tile_engine/ops/gemm/CMakeLists.txt

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -122,15 +122,15 @@ function(build_individual_gemm_targets datatype layout)
122122
if(DEFINED ENV{GEMM_CONFIG_FILE} AND NOT "$ENV{GEMM_CONFIG_FILE}" STREQUAL "")
123123
set(config_filename "$ENV{GEMM_CONFIG_FILE}")
124124
set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
125-
message(STATUS " Using config from environment variable: ${config_filename}")
125+
message(VERBOSE " Using config from environment variable: ${config_filename}")
126126
elseif(NOT "${GEMM_CONFIG_FILE}" STREQUAL "")
127127
# Use CMake variable if set
128128
set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${GEMM_CONFIG_FILE}")
129-
message(STATUS " Using custom config: ${GEMM_CONFIG_FILE}")
129+
message(VERBOSE " Using custom config: ${GEMM_CONFIG_FILE}")
130130
else()
131131
# Use default config for all layouts
132132
set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
133-
message(STATUS " Using default config for layout ${layout}")
133+
message(VERBOSE " Using default config for layout ${layout}")
134134
endif()
135135

136136
# Check if config file exists
@@ -151,16 +151,16 @@ function(build_individual_gemm_targets datatype layout)
151151
endif()
152152

153153
# Generate individual kernel files using parallel version
154-
message(STATUS "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
155-
message(STATUS " Working path: ${working_path}")
156-
message(STATUS " Config file: ${json_blob}")
157-
message(STATUS " Python executable: ${Python3_EXECUTABLE}")
158-
message(STATUS " Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py")
154+
message(VERBOSE "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
155+
message(VERBOSE " Working path: ${working_path}")
156+
message(VERBOSE " Config file: ${json_blob}")
157+
message(VERBOSE " Python executable: ${Python3_EXECUTABLE}")
158+
message(VERBOSE " Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py")
159159

160160
# Create working directory first
161161
file(MAKE_DIRECTORY ${working_path})
162162

163-
message(STATUS "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
163+
message(VERBOSE "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
164164
--working_path ${working_path}
165165
--datatype ${datatype}
166166
--layout ${layout}
@@ -169,7 +169,7 @@ function(build_individual_gemm_targets datatype layout)
169169
--list_kernels ")
170170

171171
# First, just list the kernels (fast operation)
172-
message(STATUS " Listing kernel configurations...")
172+
message(VERBOSE " Listing kernel configurations...")
173173
execute_process(
174174
COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
175175
--working_path ${working_path}
@@ -192,7 +192,7 @@ function(build_individual_gemm_targets datatype layout)
192192
if(EXISTS ${working_path}/gemm_kernel_count.txt)
193193
file(READ ${working_path}/gemm_kernel_count.txt kernel_count)
194194
string(STRIP "${kernel_count}" kernel_count)
195-
message(STATUS " Found ${kernel_count} kernel configurations")
195+
message(VERBOSE " Found ${kernel_count} kernel configurations")
196196
else()
197197
message(FATAL_ERROR "Kernel count file not found")
198198
endif()
@@ -216,10 +216,10 @@ function(build_individual_gemm_targets datatype layout)
216216
endfunction()
217217

218218
# Main build logic - Only individual builds supported
219-
message(STATUS "=== Starting Tile Engine GEMM Configuration ===")
220-
message(STATUS "GEMM_DATATYPE: ${GEMM_DATATYPE}")
221-
message(STATUS "GEMM_LAYOUT: ${GEMM_LAYOUT}")
222-
message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
219+
message(VERBOSE "=== Starting Tile Engine GEMM Configuration ===")
220+
message(VERBOSE "GEMM_DATATYPE: ${GEMM_DATATYPE}")
221+
message(VERBOSE "GEMM_LAYOUT: ${GEMM_LAYOUT}")
222+
message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
223223

224224
# Filter GPU targets to only gfx90a, gfx942, gfx950, gfx1201
225225
set(GEMM_GPU_TARGETS_INDIVIDUAL "")
@@ -228,15 +228,15 @@ set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201")
228228
foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
229229
if(target IN_LIST DESIRED_TARGETS)
230230
list(APPEND GEMM_GPU_TARGETS_INDIVIDUAL ${target})
231-
message(STATUS " Adding GPU target: ${target}")
231+
message(VERBOSE " Adding GPU target: ${target}")
232232
endif()
233233
endforeach()
234234

235235
# Skip build if no matching targets found
236236
if(NOT GEMM_GPU_TARGETS_INDIVIDUAL)
237237
message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
238238
else()
239-
message(STATUS "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}")
239+
message(VERBOSE "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}")
240240

241241
# Enable parallel compilation optimizations
242242
# Set up job pools for better parallel compilation control
@@ -251,12 +251,12 @@ else()
251251
find_program(CCACHE_PROGRAM ccache)
252252
if(CCACHE_PROGRAM)
253253
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
254-
message(STATUS "Using ccache for faster compilation")
254+
message(VERBOSE "Using ccache for faster compilation")
255255
else()
256256
message(WARNING "ccache requested but not found")
257257
endif()
258258
else()
259-
message(STATUS "ccache disabled for GEMM ops (use -DENABLE_CCACHE_GEMM=ON to enable)")
259+
message(VERBOSE "ccache disabled for GEMM ops (use -DENABLE_CCACHE_GEMM=ON to enable)")
260260
endif()
261261

262262
# Create master collection targets

tile_engine/ops/gemm/gemm_instance_builder.py

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,30 @@
88
import concurrent.futures
99
from pathlib import Path
1010
import logging
11-
from commons.validation_utils import (
12-
is_tile_config_valid,
13-
is_trait_combination_valid,
14-
get_dtype_string,
15-
get_abc_layouts,
16-
)
11+
import importlib.util
12+
13+
14+
def _import_validation_utils():
15+
"""Import validation utilities from commons directory."""
16+
current_dir = os.path.dirname(os.path.abspath(__file__))
17+
parent_dir = os.path.dirname(current_dir)
18+
19+
# Load the module dynamically
20+
spec = importlib.util.spec_from_file_location(
21+
"validation_utils", os.path.join(parent_dir, "commons", "validation_utils.py")
22+
)
23+
validation_utils = importlib.util.module_from_spec(spec)
24+
spec.loader.exec_module(validation_utils)
25+
26+
return validation_utils
27+
28+
29+
# Import validation functions
30+
_validation_utils = _import_validation_utils()
31+
is_tile_config_valid = _validation_utils.is_tile_config_valid
32+
is_trait_combination_valid = _validation_utils.is_trait_combination_valid
33+
get_dtype_string = _validation_utils.get_dtype_string
34+
get_abc_layouts = _validation_utils.get_abc_layouts
1735

1836
logging.basicConfig(level=logging.INFO)
1937

@@ -563,6 +581,8 @@ def generate_individual(self, num_workers=None):
563581
tile_configs = self._get_tile_configs()
564582
trait_combos = self._generate_trait_combinations()
565583
k_block_per_cu = self.config.get("k_block_per_cu")
584+
if k_block_per_cu is None:
585+
k_block_per_cu = 1
566586

567587
# Prepare work items for parallel processing
568588
work_items = []
@@ -574,11 +594,12 @@ def generate_individual(self, num_workers=None):
574594
trait_combo,
575595
k_block_per_cu,
576596
self.working_path,
597+
self.gpu_target,
577598
self.datatype,
578599
self.layout,
600+
self.config_json,
579601
)
580602
)
581-
582603
print(
583604
f"Generating {len(work_items)} individual kernel files using {num_workers} workers..."
584605
)
@@ -615,7 +636,6 @@ def generate_individual(self, num_workers=None):
615636
print(
616637
f" Progress: {completed}/{len(work_items)} kernels generated"
617638
)
618-
619639
try:
620640
result = future.result()
621641
if result:
@@ -662,10 +682,19 @@ def _generate_cmake_individual_targets(self, kernel_list):
662682

663683
def _generate_single_kernel_individual(work_item):
664684
"""Worker function to generate a single individual kernel file"""
665-
tile_config, trait_combo, k_block_per_cu, working_path, datatype, layout = work_item
685+
(
686+
tile_config,
687+
trait_combo,
688+
k_block_per_cu,
689+
working_path,
690+
gpu_target,
691+
datatype,
692+
layout,
693+
config_json,
694+
) = work_item
666695

667696
# Create a temporary builder instance for this worker
668-
builder = GemmKernelBuilder(working_path, datatype, layout)
697+
builder = GemmKernelBuilder(working_path, gpu_target, datatype, layout, config_json)
669698

670699
try:
671700
kernel_name, instance_code = builder._generate_kernel_instance(
@@ -798,6 +827,8 @@ def main():
798827
)
799828

800829
k_block_per_cu = builder.config.get("k_block_per_cu")
830+
if k_block_per_cu is None:
831+
k_block_per_cu = 1
801832

802833
# Generate the kernel
803834
kernel_name, instance_code = builder._generate_kernel_instance(

0 commit comments

Comments
 (0)