diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir new file mode 100644 index 000000000..76ef7dd63 --- /dev/null +++ b/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir @@ -0,0 +1,17 @@ +// input ${M}x${K}x32x64x${TYPE1} +// input ${N}x${K}x64x32x${TYPE1} + +func.func @matmul4d(%arg0: tensor<${M}x${K}x32x64x${TYPE1}>, %arg1: tensor<${N}x${K}x64x32x${TYPE1}>) -> tensor<${N}x${M}x32x32x${TYPE2}> { + %cst = arith.constant ${ZERO} : ${TYPE2} + %0 = tensor.empty() : tensor<${N}x${M}x32x32x${TYPE2}> + %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${N}x${M}x32x32x${TYPE2}>) -> tensor<${N}x${M}x32x32x${TYPE2}> + %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M}x${K}x32x64x${TYPE1}>, tensor<${N}x${K}x64x32x${TYPE1}>) outs(%1 : tensor<${N}x${M}x32x32x${TYPE2}>) { + ^bb0(%in: ${TYPE1}, %in_1: ${TYPE1}, %out: ${TYPE2}): + %12 = ${EXT} %in : ${TYPE1} to ${TYPE2} + %13 = ${EXT} %in_1 : ${TYPE1} to ${TYPE2} + %14 = ${MUL} %12, %13 : ${TYPE2} + %15 = ${ADD} %out, %14 : ${TYPE2} + linalg.yield %15 : ${TYPE2} + } -> tensor<${N}x${M}x32x32x${TYPE2}> + return %2 : tensor<${N}x${M}x32x32x${TYPE2}> +} diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py index adc42cfb9..7fdd36b6d 100644 --- a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py +++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py @@ -1,6 +1,4 @@ -import sys import re -import os def get_higher_order_element_type(element_type): @@ -29,6 +27,8 @@ def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b acc_is_int = acc_type[0] == "i" replace["ZERO"] = 0 if acc_is_int else 0.0 replace["ADD"] = "arith.addi" if acc_is_int else "arith.addf" + replace["MUL"] = "arith.muli" if acc_is_int else "arith.mulf" + replace["EXT"] = "arith.extsi" if acc_is_int else "arith.extf" key_map = map(lambda s: "${" + s + "}", replace.keys()) key_map_escaped = map(re.escape, key_map) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 3b8c585ef..adc310cbe 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -19,7 +19,6 @@ from input_generator import ( generate_inputs, verify_determinism, - load_input, get_output_type, np_from_binfile, ) @@ -486,6 +485,59 @@ def _execute(self, config): return self.vs_cpu(config) +class Matmul4d(BaseMatmul): + """ + A test of linalg.generic with 4d inputs and output implementing form: + C += matmul4d(A,B) where A:MxKxM0xK0, B:NxKxK0xN0, C:NxMxM0xN0 + + Note that the outer dims for this operation are transposed to make sure + successful compilation through LogicalObjectFifo pipeline. + For comparison purpose, the input values of inner dims M0/N0/K0 are + fixed as 32/32/64 currently. + TODO(vivian): Generalize the class and the template. 
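+ For example, M=16, N=128, K=8 gives A: 16x8x32x64, B: 128x8x64x32 and
+ C: 128x16x32x32, i.e. the equivalent standard matmul has
+ M = 16*32 = 512, N = 128*32 = 4096, K = 8*64 = 512.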
+ """ + + def __init__( + self, + M, + N, + K, + input_type, + acc_type, + additional_labels=None, + n_kernel_runs=1, + test_params=None, + ): + super().__init__( + name=f"matmul4d_{M}_{N}_{K}_{input_type}_{acc_type}", + test_params=test_params, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, + function_name="matmul4d", + n_kernel_runs=n_kernel_runs, + ) + self.labels.append("Matmul4d") + if additional_labels: + self.labels += additional_labels + if self.run_benchmark: + self.aie_compilation_flags += [ + "--iree-amdaie-enable-infinite-loop-around-core-block=true" + ] + self.labels.append("Matmul4dBenchmark") + + def _execute(self, config): + matmul_template_dir = config.file_dir / "matmul_template" + template_name = matmul_template_dir / "matmul4d_MxKxM0xK0_NxKxK0xN0.mlir" + self.generate(config, template_name) + if self.run_benchmark: + return self.benchmark(config) + + return self.vs_cpu(config) + + class MatmulThinBias(BaseMatmul): """ A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:N @@ -502,9 +554,11 @@ def __init__( ): super().__init__( name=f"matmul_thin_bias_{M}_{N}_{K}_{input_type}_{acc_type}", - test_params=test_params - if test_params is not None - else TestParams(lower_to_aie_pipeline="air"), + test_params=( + test_params + if test_params is not None + else TestParams(lower_to_aie_pipeline="air") + ), M=M, N=N, K=K, @@ -543,9 +597,11 @@ def __init__( ): super().__init__( name=f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}", - test_params=test_params - if test_params is not None - else TestParams(lower_to_aie_pipeline="air"), + test_params=( + test_params + if test_params is not None + else TestParams(lower_to_aie_pipeline="air") + ), M=M, N=N, K=K, @@ -565,8 +621,7 @@ def _execute(self, config): "--iree-amdaie-num-cols=2", ] ) - self.vs_cpu(config) - return True + return self.vs_cpu(config) class BatchMatmul(BaseMatmul): @@ -2194,6 +2249,21 @@ def __init__(self): "transpose_b": False, "tile_pipeline": "pack-peel-4-level-tiling", }, + # matmul4d test where the input M/N/K are outer dim values. + # The total input values correspond to a standard matmul + # from the above test are M:512, N:4096, K:512. + { + "M": 16, + "N": 128, + "K": 8, + "use_ukernel": True, + "peano_opt_level": 3, + "outline": "balanced", + "transpose_a": False, + "transpose_b": False, + "matmul4d": True, + "tile_pipeline": "pack-peel-4-level-tiling", + }, # Test where the compute is omitted, this should help triangulate # how much performance gain can be obtained with better matmul # on core vs data movement. @@ -2257,6 +2327,24 @@ def __init__(self): "tile_pipeline": "pack-peel-4-level-tiling", "run_on_target": "npu4", }, + # matmul4d test where the input M/N/K are outer dim values. + # The total input values correspond to a standard matmul + # from the above test are M:512, N:4096, K:512. 
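+ # (Outer dims 16/128/8 with the fixed inner dims 32/32/64: 16*32 = 512, 128*32 = 4096, 8*64 = 512.)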
+ { + "M": 16, + "N": 128, + "K": 8, + "in_dtype": "i8", + "out_dtype": "i32", + "use_ukernel": True, + "peano_opt_level": 3, + "outline": "all", + "transpose_a": False, + "transpose_b": False, + "matmul4d": True, + "tile_pipeline": "pack-peel-4-level-tiling", + "run_on_target": "npu4", + }, { "M": 512, "N": 4096, @@ -2301,6 +2389,7 @@ def __init__(self): transpose_a = test["transpose_a"] transpose_b = test["transpose_b"] tile_pipeline = test["tile_pipeline"] + matmul4d = test["matmul4d"] if "matmul4d" in test else False run_on_target = ( test["run_on_target"] if "run_on_target" in test else "npu1_4col" ) @@ -2343,7 +2432,9 @@ def __init__(self): else: name_suffix += "_outline" - if (transpose_a, transpose_b) == (False, False): + if matmul4d: + TestClass = Matmul4d + elif (transpose_a, transpose_b) == (False, False): TestClass = Matmul elif (transpose_a, transpose_b) == (True, False): TestClass = MatmulTransposeA diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp index 40b726691..969fa9f34 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp @@ -172,7 +172,7 @@ void AMDAIEBufferizeToAllocationPass::runOnOperation() { !linalg::isaConvolutionOpInterface(op)) { return WalkResult::advance(); } - if (isa(op)) { + if (isa(op)) { return WalkResult::advance(); } // Use flag `bufferizeElementwise` to indicate whether the target for diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseConsumerIntoLoop.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseConsumerIntoLoop.cpp index 927c9bd5d..f119b31aa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseConsumerIntoLoop.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseConsumerIntoLoop.cpp @@ -48,7 +48,7 @@ void AMDAIEFuseConsumerIntoLoopPass::runOnOperation() { // Check if there is matmul-elementwise fusion opportunity. If so, overwrite // the `fuseDepth` to be 2. 
funcOp->walk([&](linalg::LinalgOp op) { - if (isMatmulProducerOfElementwise(op)) { + if (isElementwiseWithMatmulProducer(op)) { fuseDepth = 2; return WalkResult::interrupt(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp index 8fbfd6434..eb981a073 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp @@ -57,6 +57,13 @@ class AMDAIELowerExecutableTargetPass }; } // namespace +static Operation *getRootOp(FunctionOpInterface funcOp) { + SmallVector computeOps = getComputeOps(funcOp); + FailureOr rootOp = getRootOperation(computeOps); + assert(succeeded(rootOp) && "Pipeline requires a root operation"); + return rootOp.value(); +} + void AMDAIELowerExecutableTargetPass::runOnOperation() { auto funcOp = getOperation(); auto target = IREE::HAL::ExecutableTargetAttr::lookup(funcOp); @@ -82,7 +89,7 @@ void AMDAIELowerExecutableTargetPass::runOnOperation() { TilePassPipeline::PackPeel4LevelTilingPipeline) { addPackPeel4LevelTilingBasedPassPipeline( executableLoweringPipeline, pathToUkernels, - TilePassPipeline::PackPeel4LevelTilingPipeline); + TilePassPipeline::PackPeel4LevelTilingPipeline, getRootOp(funcOp)); } else if (useTilePipeline == TilePassPipeline::PadPackPipeline) { addPadPackBasedPassPipeline(executableLoweringPipeline, pathToUkernels, enableVectorizationPasses, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp index fb54f8693..ae1ba0e09 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp @@ -468,47 +468,62 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline( MLIRContext *context = entryPointFn.getContext(); unsigned numLoops = linalgOp.getNumLoops(); - // Pack level => 1. - SmallVector packedSizesL0(numLoops, 0); - packedSizesL0[mDims.back()] = packPeelTiling.m0Pack; - packedSizesL0[nDims.back()] = packPeelTiling.n0Pack; - packedSizesL0[kDims.back()] = packPeelTiling.k0Pack; - - // For matmul, transpose B matrix from [K N n k] to [N K k n] - // For matmul_transpose_b, we don't have to transpose the B matrix, - // since it is already [N K n k] - SmallVector transposePackIndices = {0, 1, 2}; - // There is no corresponding unpack for the specified pack operation - // 0 is used when unpack is empty - SmallVector unpackEmpty = {false, false, true}; + bool isBatchMatmul = isa(linalgOp); SmallVector innerPermA = setInnerPermA(isMatmulTransposeA(linalgOp)); SmallVector innerPermB = setInnerPermB(isMatmulTransposeB(linalgOp)); - SmallVector> innerPerm = { - innerPermA, innerPermB, {0, 1}}; - bool isBatchMatmul = isa(linalgOp); SmallVector outerPermA = setOuterPermA(isMatmulTransposeA(linalgOp), isBatchMatmul); SmallVector outerPermB = setOuterPermB(isMatmulTransposeB(linalgOp), isBatchMatmul); - SmallVector> outerPerm = {outerPermA, outerPermB}; - // Add outer permutation for unpack. NOTE: This currently fails for some - // tests in the AIR pipeline. 
- if (isa(linalgOp)) { - outerPerm.push_back({0, 2, 1}); - } else { - outerPerm.push_back({1, 0}); - } - auto packingConfigLevel0Attr = getPackingConfigPackingLevelAttr( - context, packedSizesL0, transposePackIndices, unpackEmpty, innerPerm, - outerPerm); + SmallVector transposePackIndices; + SmallVector unpackEmpty; + SmallVector> innerPerm; + SmallVector> outerPerm; + SmallVector packingConfigLevelsVal; + + // Pack level => 1. + // For 2D matmul-like ops, the first level is to pack operands from 2D to 4D. + // If the input is a 4D matmul-like op, this level of packing is not needed. + bool is2DMatmulLike = is2DMatmulLikeOp(linalgOp) || isBatchMatmul; + if (is2DMatmulLike) { + SmallVector packedSizesL0(numLoops, 0); + packedSizesL0[mDims.back()] = packPeelTiling.m0Pack; + packedSizesL0[nDims.back()] = packPeelTiling.n0Pack; + packedSizesL0[kDims.back()] = packPeelTiling.k0Pack; + + transposePackIndices = {0, 1, 2}; + // There is no corresponding unpack for the specified pack operation + // 0 is used when unpack is empty + unpackEmpty = {false, false, true}; + innerPerm = {innerPermA, innerPermB, {0, 1}}; + outerPerm = {outerPermA, outerPermB}; + // Add outer permutation for unpack. NOTE: This currently fails for some + // tests in the AIR pipeline. + if (isBatchMatmul) { + outerPerm.push_back({0, 2, 1}); + } else { + outerPerm.push_back({1, 0}); + } + + auto packingConfigLevel0Attr = getPackingConfigPackingLevelAttr( + context, packedSizesL0, transposePackIndices, unpackEmpty, innerPerm, + outerPerm); + packingConfigLevelsVal.push_back(packingConfigLevel0Attr); + } // Pack level => 2. - // The number of loops have increased by 3 due to the first level pack. - SmallVector packedSizesL1(numLoops + 3, 0); - packedSizesL1[mDims.back() + 3] = packPeelTiling.m1Pack; - packedSizesL1[nDims.back() + 3] = packPeelTiling.n1Pack; - packedSizesL1[kDims.back() + 3] = packPeelTiling.k1Pack; + // If the first level pack exists (for 2D matmul-like ops), the number of + // packed dimensions should increase by 3, otherwise keep the original + // number of loops. + unsigned numPackedDims = is2DMatmulLike ? numLoops + 3 : numLoops; + unsigned mIdx = is2DMatmulLike ? mDims.back() + 3 : mDims.back(); + unsigned nIdx = is2DMatmulLike ? nDims.back() + 3 : nDims.back(); + unsigned kIdx = is2DMatmulLike ? 
kDims.back() + 3 : kDims.back(); + SmallVector packedSizesL1(numPackedDims, 0); + packedSizesL1[mIdx] = packPeelTiling.m1Pack; + packedSizesL1[nIdx] = packPeelTiling.n1Pack; + packedSizesL1[kIdx] = packPeelTiling.k1Pack; // Transpose A matrix from [M K m k m0 k0] to [M K k m m0 k0] // Transpose C matrix from [M N m n m0 n0] to [M N n m m0 n0] @@ -519,7 +534,7 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline( // Only the third pack operation has a corresponding unpack operation unpackEmpty = {false, false, true}; innerPerm = {innerPermA, innerPermB, {0, 1}}; - if (isa(linalgOp)) { + if (isBatchMatmul) { outerPerm = {{0, 1, 2, 4, 3}, {0, 1, 2, 4, 3}, {0, 1, 2, 4, 3}}; } else { outerPerm = {{0, 1, 3, 2}, {0, 1, 3, 2}, {0, 1, 3, 2}}; @@ -527,9 +542,8 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline( auto packingConfigLevel1Attr = getPackingConfigPackingLevelAttr( context, packedSizesL1, transposePackIndices, unpackEmpty, innerPerm, outerPerm); + packingConfigLevelsVal.push_back(packingConfigLevel1Attr); - SmallVector packingConfigLevelsVal = { - packingConfigLevel0Attr, packingConfigLevel1Attr}; auto packingConfigLevels = PackingConfigPackingLevelsAttr::get(context, packingConfigLevelsVal); auto config = PackingConfigAttr::get(context, packingConfigLevels); @@ -550,14 +564,22 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline( bool fitsInL2 = (l2SizeA + l2SizeB + l2SizeInit) < (deviceModel.getMemTileSizeInBytes() * numCols); int64_t scaleL0 = !isBatchMatmul && fitsInL2 ? 2 : 1; + int64_t m0Tile = packPeelTiling.M0 * scaleL0; + int64_t n0Tile = packPeelTiling.N0 * scaleL0; SmallVector tileSizeLevel0(numLoops, 0); - if (isa(linalgOp)) { + if (isBatchMatmul) { assert(!batchDims.empty() && "expected batch dims not empty"); tileSizeLevel0[batchDims[0]] = 1; } - tileSizeLevel0[mDims[0]] = packPeelTiling.M0 * scaleL0; - tileSizeLevel0[nDims[0]] = packPeelTiling.N0 * scaleL0; + // For 4D matmul-like ops, only tile the outer dims. 
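+ // The inner dims of a 4D matmul-like op already carry the m0/n0/k0 packing,
+ // so the total tile size is divided by the inner dim size to obtain the
+ // tile size for the corresponding outer dim: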
+ // outer_tile_size = total_tile_size / inner_dim_size + if (is4DMatmulLikeOp(linalgOp)) { + m0Tile /= maybeInputDimsAndSizes.value().mSizes.back(); + n0Tile /= maybeInputDimsAndSizes.value().nSizes.back(); + } + tileSizeLevel0[mDims[0]] = m0Tile; + tileSizeLevel0[nDims[0]] = n0Tile; SmallVector tileSizeLevel1(numLoops, 0); tileSizeLevel1[mDims[0]] = numRows; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index c79c87fc7..f76e98264 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -21,6 +21,7 @@ #include "air/Transform/AffineLoopOptPass.h" #include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" #include "iree-dialects/Dialect/LinalgTransform/Passes.h" #include "iree/compiler/Codegen/Common/Passes.h" #include "iree/compiler/Utils/ToolUtils.h" @@ -309,9 +310,13 @@ void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager, funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass()); } -void addPackPeel4LevelTilingBasedPassPipeline( - OpPassManager &funcPassManager, const std::string &pathToUkernels, - TilePassPipeline useTilePipeline) { +void addPackPeel4LevelTilingBasedPassPipeline(OpPassManager &funcPassManager, + const std::string &pathToUkernels, + TilePassPipeline useTilePipeline, + Operation *rootOp) { + // Check if the root op is a 4D matmul-like operation. + bool is4DMatmulOp = is4DMatmulLikeOp(cast(rootOp)); + // First level tiling using scf.forall { AMDAIETileAndFuseOptions tileFuseOptions; @@ -323,18 +328,32 @@ void addPackPeel4LevelTilingBasedPassPipeline( funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); - // First level packing - { - AMDAIEPackAndTransposeOptions packOptions; - packOptions.packLevel = 0; - funcPassManager.addPass(createAMDAIEPackAndTransposePass(packOptions)); + // First level pack or pad operation for data movement. + // For 2D matmul-like ops, pack operation is used to expand operands from 2D + // to 4D. For 4D matmul-like ops, pad operation is used to keep the original + // dimensions. + if (is4DMatmulOp) { + // First level pad + { + AMDAIEPadOptions padOptions; + padOptions.paddingLevel = 0; + funcPassManager.addPass(createAMDAIEPadPass(padOptions)); + } + funcPassManager.addPass(createCanonicalizerPass()); + funcPassManager.addPass(createCSEPass()); + } else { + // First level packing + { + AMDAIEPackAndTransposeOptions packOptions; + packOptions.packLevel = 0; + funcPassManager.addPass(createAMDAIEPackAndTransposePass(packOptions)); + } + // Propagate pack ops for the elementwise op + funcPassManager.addPass(createAMDAIEPropagateDataLayoutPass()); + funcPassManager.addPass(createCanonicalizerPass()); + funcPassManager.addPass(createCSEPass()); } - // Propagate pack ops for the elementwise op - funcPassManager.addPass(createAMDAIEPropagateDataLayoutPass()); - funcPassManager.addPass(createCanonicalizerPass()); - funcPassManager.addPass(createCSEPass()); - // Promote the matmul output to shared memory { AMDAIEBufferizeToAllocationOptions bufferizeOptions; @@ -354,10 +373,11 @@ void addPackPeel4LevelTilingBasedPassPipeline( createAMDAIEBufferizeToAllocationPass(bufferizeOptions)); } - // Second level packing + // If the input is 4D matmul-like op, this is the first level of packing. 
+ // Otherwise for 2D matmul-like op, it is the second level. { AMDAIEPackAndTransposeOptions packOptions; - packOptions.packLevel = 1; + packOptions.packLevel = is4DMatmulOp ? 0 : 1; funcPassManager.addPass(createAMDAIEPackAndTransposePass(packOptions)); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index dd4468f14..a01fe71a0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -55,7 +55,8 @@ void addPackPeelBasedPassPipeline(OpPassManager &passManager, /// 4 levels of tiling. void addPackPeel4LevelTilingBasedPassPipeline(OpPassManager &passManager, const std::string &pathToUkernels, - TilePassPipeline useTilePipeline); + TilePassPipeline useTilePipeline, + Operation *rootOp); /// Populates passes needed to lower the IR via a Pad-Pack based approach. void addPadPackBasedPassPipeline(OpPassManager &passManager, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp index 8ad6c0e4a..ead1c79b9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp @@ -148,14 +148,11 @@ static bool bodyMatcherForMatmulLikeOps(Value yieldVal, Block *body) { } /// Utility to check if the input generic op is a 2D matmul-like op. -static bool is2DMatmulLikeOp(linalg::LinalgOp linalgOp) { +bool is2DMatmulLikeOp(linalg::LinalgOp linalgOp) { // Check iterator types. - SmallVector matmulIteratorTypes = { - utils::IteratorType::parallel, utils::IteratorType::parallel, - utils::IteratorType::reduction}; - SmallVector opIteratorTypes = - linalgOp.getIteratorTypesArray(); - if (matmulIteratorTypes != opIteratorTypes) return false; + unsigned numParallelLoops = linalgOp.getNumParallelLoops(); + unsigned numReductionLoops = linalgOp.getNumReductionLoops(); + if (numParallelLoops != 2 || numReductionLoops != 1) return false; // Check the number of inputs and results from indexing maps. ArrayAttr indexingMaps = linalgOp.getIndexingMaps(); @@ -174,15 +171,11 @@ static bool is2DMatmulLikeOp(linalg::LinalgOp linalgOp) { } /// Utility to check if the input generic op is a 4D matmul-like op. -static bool is4DMatmulLikeOp(linalg::LinalgOp linalgOp) { +bool is4DMatmulLikeOp(linalg::LinalgOp linalgOp) { // Check iterator types. - SmallVector matmulIteratorTypes = { - utils::IteratorType::parallel, utils::IteratorType::parallel, - utils::IteratorType::reduction, utils::IteratorType::parallel, - utils::IteratorType::parallel, utils::IteratorType::reduction}; - SmallVector opIteratorTypes = - linalgOp.getIteratorTypesArray(); - if (matmulIteratorTypes != opIteratorTypes) return false; + unsigned numParallelLoops = linalgOp.getNumParallelLoops(); + unsigned numReductionLoops = linalgOp.getNumReductionLoops(); + if (numParallelLoops != 4 || numReductionLoops != 2) return false; // Check indexing maps. ArrayAttr indexingMaps = linalgOp.getIndexingMaps(); @@ -201,17 +194,11 @@ static bool is4DMatmulLikeOp(linalg::LinalgOp linalgOp) { } /// Utility to check if the input generic op is a 6D matmul-like op. -static bool is6DMatmulLikeOp(linalg::LinalgOp linalgOp) { +bool is6DMatmulLikeOp(linalg::LinalgOp linalgOp) { // Check iterator types. 
- SmallVector matmulIteratorTypes = { - utils::IteratorType::parallel, utils::IteratorType::parallel, - utils::IteratorType::reduction, utils::IteratorType::parallel, - utils::IteratorType::parallel, utils::IteratorType::reduction, - utils::IteratorType::parallel, utils::IteratorType::parallel, - utils::IteratorType::reduction}; - SmallVector opIteratorTypes = - linalgOp.getIteratorTypesArray(); - if (matmulIteratorTypes != opIteratorTypes) return false; + unsigned numParallelLoops = linalgOp.getNumParallelLoops(); + unsigned numReductionLoops = linalgOp.getNumReductionLoops(); + if (numParallelLoops != 6 || numReductionLoops != 3) return false; // Check indexing maps. ArrayAttr indexingMaps = linalgOp.getIndexingMaps(); @@ -340,8 +327,9 @@ bool isMatmulInDefChain(Value operand) { /// Utility to identify if `linalgOp` is an elementwise operation with a /// matmul-like op upstream in its computation tree. -bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp) { - if (!linalg::isElementwise(linalgOp) || isa(linalgOp)) { +bool isElementwiseWithMatmulProducer(linalg::LinalgOp linalgOp) { + if (!linalg::isElementwise(linalgOp) || + isa(linalgOp)) { return false; } // Check if any of the defining op is a matmul-like op. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h index f7dc69a42..663e89bf6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h @@ -67,13 +67,19 @@ bool isMatmulTransposeA(linalg::LinalgOp linalgOp); /// Utility to identify whether a linalg op is a matmul_transpose_b op. bool isMatmulTransposeB(linalg::LinalgOp linalgOp); +/// Utility to identify whether a linalg op is a 2D matmul-like op. +bool is2DMatmulLikeOp(linalg::LinalgOp linalgOp); + +/// Utility to identify whether a linalg op is a 4D matmul-like op. +bool is4DMatmulLikeOp(linalg::LinalgOp linalgOp); + /// Utility to identify if the input operand has matmul-like op in its /// def-chain. bool isMatmulInDefChain(Value operand); /// Utility to identify if `linalgOp` is an elementwise operation with a /// matmul-like op upstream in its computation tree. -bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp); +bool isElementwiseWithMatmulProducer(linalg::LinalgOp linalgOp); /// Utility to convert a `uint32_t` value into a hex string. std::string utohexstr(uint32_t value, size_t width, bool header = true, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir index f8bbb02f5..b33071f1b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir @@ -1,4 +1,5 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy)' %s | FileCheck %s +// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{use-tile-pipeline=pack-peel-4-level-tiling})' %s | FileCheck %s --check-prefix=PACK-PEEL-4-LEVEL // Test generic version of matmul. @@ -115,3 +116,35 @@ module { return } } + +// ----- + +// Test generic version of matmul with 4d inputs and output. 
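+// The PACK-PEEL-4-LEVEL run checks that the pack-peel-4-level-tiling strategy
+// attaches a lowering_config and packing_config to the 4D matmul-like linalg.generic.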
+
+// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
+// PACK-PEEL-4-LEVEL{LITERAL}: #amdaie.packing_config
+module {
+  func.func @mmt4d_dispatch_0_matmul_like_16x128x32x32x8x64_bf16xbf16xf32() {
+    %cst = arith.constant 0.000000e+00 : f32
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan layout(<constants = 0, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<16x8x32x64xbf16>>
+    %1 = hal.interface.binding.subspan layout(<constants = 0, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x8x64x32xbf16>>
+    %2 = hal.interface.binding.subspan layout(<constants = 0, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x16x32x32xf32>>
+    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [16, 8, 32, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x8x32x64xbf16>> -> tensor<16x8x32x64xbf16>
+    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 8, 64, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x8x64x32xbf16>> -> tensor<128x8x64x32xbf16>
+    %5 = tensor.empty() : tensor<128x16x32x32xf32>
+    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x16x32x32xf32>) -> tensor<128x16x32x32xf32>
+    // PACK-PEEL-4-LEVEL: linalg.generic
+    // PACK-PEEL-4-LEVEL-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
+    %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d4, d5, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%3, %4 : tensor<16x8x32x64xbf16>, tensor<128x8x64x32xbf16>) outs(%6 : tensor<128x16x32x32xf32>) {
+    ^bb0(%in: bf16, %in_0: bf16, %out: f32):
+      %8 = arith.extf %in : bf16 to f32
+      %9 = arith.extf %in_0 : bf16 to f32
+      %10 = arith.mulf %8, %9 : f32
+      %11 = arith.addf %out, %10 : f32
+      linalg.yield %11 : f32
+    } -> tensor<128x16x32x32xf32>
+    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [128, 16, 32, 32], strides = [1, 1, 1, 1] : tensor<128x16x32x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x16x32x32xf32>>
+    return
+  }
+}
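
As a side note on the generator change above: a minimal, illustrative sketch of how the new ${MUL}/${EXT} placeholders (together with the existing ${ZERO}/${ADD}) are resolved from the accumulator type. The helper name expand_matmul4d_ops and the plain str.replace loop are made up for illustration; the real matmul_generator.py builds an escaped-key regex and also substitutes the shape and type placeholders.

def expand_matmul4d_ops(template: str, acc_type: str) -> str:
    # Integer accumulators (e.g. "i32") select the signed-integer ops,
    # floating-point accumulators (e.g. "f32") select the float ops.
    acc_is_int = acc_type[0] == "i"
    replace = {
        "ZERO": "0" if acc_is_int else "0.0",
        "ADD": "arith.addi" if acc_is_int else "arith.addf",
        "MUL": "arith.muli" if acc_is_int else "arith.mulf",
        "EXT": "arith.extsi" if acc_is_int else "arith.extf",
    }
    for key, value in replace.items():
        template = template.replace("${" + key + "}", value)
    return template

# e.g. with an i32 accumulator the template line
#   "${MUL} %12, %13 : ${TYPE2}"
# becomes "arith.muli %12, %13 : ${TYPE2}" (the type placeholders are filled
# elsewhere by the generator).
print(expand_matmul4d_ops("${MUL} %12, %13 : ${TYPE2}", "i32"))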