From af8016e95f0d7d9b627c033deb6d97a9ee59dcd6 Mon Sep 17 00:00:00 2001
From: James Newling <james.newling@gmail.com>
Date: Sun, 12 Jan 2025 23:24:11 -0800
Subject: [PATCH] [CI] Use pre-commit (copy IREE) for linting (#1020)

This commit aligns iree-amd-aie's linting with IREE's. It copies files
https://github.com/iree-org/iree/blob/main/.pre-commit-config.yaml and
https://github.com/iree-org/iree/blob/main/.github/workflows/lint.yml ,
with the only change being a removal of bazel and yml linting, as we
don't have bazel or yml in iree-amd-aie (and the bazel linter was
failing).

This commit also includes the changes needed to pass the linter. Running
`pre-commit run --all-files` basically did everything needed, with 2
places where a `clang-format off/on` pair was needed to avoid header
rearrangement.

Note to developers: if your PR fails linting after this lands, it's very
easy to fix locally: `pip install pre-commit` and then run pre-commit on
all or a subset of the project files.
---
 .github/CODEOWNERS                            |   1 -
 .github/workflows/black.yml                   |  12 -
 .github/workflows/ci-windows.yml              |   4 +-
 .github/workflows/lint.yml                    |  29 ++
 .gitignore                                    |   2 +-
 .pre-commit-config.yaml                       |  54 +++
 LICENSE                                       |   1 -
 README.md                                     |  13 +-
 .../convolution_generator.py                  |   1 -
 .../ci/cpu_comparison/input_generator.py      |   1 -
 build_tools/ci/cpu_comparison/run.py          |   7 -
 build_tools/ci/reset_npu.sh                   |   1 -
 build_tools/ci/run_matmul_test.sh             |   4 +-
 build_tools/clang_llvm_tools_not_to_build.txt |   2 +-
 cmake/iree_aie_utils.cmake                    |   1 -
 .../xdna-oplib/Transforms/PassDetail.h        |   5 +-
 .../xdna-oplib/Transforms/Passes.cpp          |   3 +-
 .../XDNA-OPLIB/xdna-oplib/Transforms/Passes.h |   7 +-
 .../plugins/target/AMD-AIE/aie/AIEDialect.cpp |   7 +-
 .../plugins/target/AMD-AIE/aie/AIEDialect.h   |   6 +-
 .../plugins/target/AMD-AIE/aie/AIEEnums.h     |   2 +
 compiler/plugins/target/AMD-AIE/aie/AIEOps.td |   2 +-
 .../aie/AMDAIENormalizeAddressSpaces.cpp      |   2 +-
 .../aie/test/aie2_memtile_connection.mlir     |   6 +-
 .../target/AMD-AIE/aie/test/local_locks.mlir  |   1 -
 .../AMD-AIE/aie/test/normalize_call_op.mlir   |   2 +-
 .../AMD-AIE/aie/test/test_congestion0.mlir    |  16 +-
 .../test/test_create_packet_flows_shim0.mlir  |   2 +-
 .../test/test_create_packet_flows_shim1.mlir  |   2 +-
 .../aie/test/trace_packet_routing.mlir        |   6 +-
 .../aie/test/unit_simple_flows_shim.mlir      |   2 -
 .../target/AMD-AIE/aievec/AIEVecOps.cpp       |  26 +-
 .../target/AMD-AIE/aievec/AIEVecToLLVM.cpp    |  17 +-
 .../aievec/VectorToVectorConversions.cpp      |   3 +-
 .../target/AMD-AIE/aievec/XLLVMDialect.h      |   2 +-
 .../target/AMD-AIE/aievec/test/test-srs.mlir  |  28 +-
 .../target/AMD-AIE/aievec/test/test-ups.mlir  |  50 +--
 .../plugins/target/AMD-AIE/air/CMakeLists.txt |   2 +-
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td    |  10 +-
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEDialect.td  |   2 +-
 .../iree-amd-aie/IR/AMDAIEDmaOpInterface.cpp  |   6 +-
 .../iree-amd-aie/IR/AMDAIEDmaOpInterface.h    |   1 -
 .../iree-amd-aie/IR/AMDAIEDmaOpInterface.td   |   6 +-
 .../IR/AMDAIELogicalObjFifoOpInterface.td     |   2 +-
 .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td      | 202 +++++-----
 .../AMD-AIE/iree-amd-aie/IR/AMDAIETypes.cpp   |   3 +-
 .../AMD-AIE/iree-amd-aie/IR/AMDAIETypes.h     |   5 +-
 .../AMD-AIE/iree-amd-aie/IR/AMDAIETypes.td    |  12 +-
 .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp |   1 -
 .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.h   |   2 +-
 .../AMD-AIE/iree-amd-aie/Target/mm_npu4.cc    |   2 +-
 .../iree-amd-aie/Target/test/bd_chaining.mlir | 374 +++++++++---------
 ...8xi32__dispatch_0_matmul_16x1_0.aiecc.mlir |  12 +-
 ...32xi8__dispatch_0_matmul_tran_0.aiecc.mlir |  12 +-
 ...64xbf16__dispatch_0_matmul_64_0.aiecc.mlir |  12 +-
 ...64xi8__dispatch_0_matmul_64x6_0.aiecc.mlir |  12 +-
 ...6xi32__dispatch_0_matmul_8x32_0.aiecc.mlir |  12 +-
 .../iree-amd-aie/Target/test/regenerate.sh    |   2 +-
 .../matmul_fill_spec_pad_pack.mlir            |   2 +-
 .../matmul_fill_spec_simple_pack.mlir         |   1 -
 .../AMDAIEControlCodeForallToFor.cpp          |   3 +-
 .../AMDAIEConvertCoreForallToFor.cpp          |   2 +-
 .../Transforms/AMDAIECreateAIEWorkgroup.cpp   |   2 +-
 .../AMDAIEDistributeCoresAndObjectFifos.cpp   |   4 +-
 .../Transforms/AMDAIEDmaLoopSubsumption.cpp   |   2 +-
 .../AMDAIEInsertLoopsForVectorization.cpp     |  14 +-
 .../AMDAIELinalgFunctionOutlining.cpp         |   1 -
 .../Transforms/AMDAIELoadAlignmentReset.cpp   |  16 +-
 .../Transforms/AMDAIELowerToUKernels.cpp      |   3 +-
 .../Transforms/AMDAIESinkIntoCore.cpp         |   3 +-
 .../Transforms/BridgeToAIRPass.cpp            |   3 +-
 .../DecomposeLinalgExtPackUnPackToAIR.cpp     |   3 +-
 .../iree-amd-aie/Transforms/KernelDispatch.h  |  13 +-
 .../Transforms/test/AMDAIEUtilsTest.cpp       |   5 +-
 .../test/access_to_acquire_release.mlir       |   1 -
 .../test/assign_npu_dma_bd_ids.mlir           |   8 +-
 .../Transforms/test/combine_strided_ops.mlir  |   4 +-
 .../Transforms/test/controlcode_lowering.mlir |   2 +-
 .../test/convert_to_dma_failures.mlir         |   1 -
 .../Transforms/test/create_aie_workgroup.mlir |   2 +-
 .../Transforms/test/dma_loop_subsumption.mlir |  12 +-
 .../test/dma_loop_subsumption_circular.mlir   |   2 +-
 .../Transforms/test/fold_dma_waits.mlir       |   4 +-
 .../test/fuse_consumer_into_loop.mlir         |   6 +-
 .../Transforms/test/fuse_pack_into_loop.mlir  |   3 -
 .../test/generate_control_overlay.mlir        |   6 +-
 .../Transforms/test/insert_dma_bd_chain.mlir  |   2 +-
 ...nsert_infinite_loop_around_core_block.mlir |   2 +-
 .../test/insert_loops_for_vectorization.mlir  |   2 +-
 .../test/linalg_function_outlining.mlir       |   1 -
 .../test/localize_logical_objectfifo.mlir     |   1 -
 .../Transforms/test/lower_func_args.mlir      |   1 -
 .../Transforms/test/lower_to_aie.mlir         |  14 +-
 .../test/lower_workgroup_count.mlir           |   2 +-
 .../test/lowering_strategy_air.mlir           |  12 +-
 .../test/lowering_strategy_failures.mlir      |   2 -
 .../Transforms/test/pack_to_air.mlir          |  14 +-
 .../Transforms/test/sink_into_core.mlir       |   1 -
 .../test/split_logicalobjfifos.mlir           |   8 +-
 ..._logicalobjfifos_for_connection_reuse.mlir |  24 +-
 ...and_fuse_convolution_using_scf_forall.mlir |   2 -
 .../test/tile_copy_using_scf_for.mlir         |   1 -
 .../test/unsupported_pipelines.mlir           |   2 -
 experimental/delegate/README.md               |   6 +-
 experimental/delegate/large-matmul-f32.mlir   |   1 -
 .../delegate/large-matmul-f32.pdl.mlir        |   6 +-
 experimental/delegate/large-matmul.pdl.mlir   |   6 +-
 experimental/delegate/linalg.pdl.mlir         |   6 +-
 experimental/delegate/matmul-16k.pdl.mlir     |   8 +-
 experimental/delegate/mlp_aie_bf16_plugin.cpp | 372 ++++++++---------
 .../delegate/mlp_spec_matmul_elementwise.mlir |  10 +-
 experimental/delegate/opt.mlir                |   2 +-
 experimental/delegate/opt.pdl.mlir            |   6 +-
 .../iree-amd-aie/aie_runtime/AMDAIEEnums.cpp  |   1 +
 .../iree-amd-aie/aie_runtime/CMakeLists.txt   |   1 -
 .../src/iree-amd-aie/aie_runtime/d_ary_heap.h |   2 +-
 .../aie_runtime/iree_aie_router.h             |   4 +-
 .../aie_runtime/iree_aie_runtime.cc           |   4 +-
 .../aie_runtime/iree_aie_runtime.h            |  37 +-
 .../aie_runtime/test/CMakeLists.txt           |   2 +-
 .../aie_runtime/test/test_amsel_generator.cc  | 177 ++++++---
 .../aie_runtime/test/test_control_packets.cc  |   6 +-
 .../src/iree-amd-aie/aie_runtime/xaie_hwcfg.c |   3 +-
 .../xrt-lite/cts/executable_cache_test.mlir   |   1 -
 .../xrt-lite/shim/linux/kmq/amdxdna_accel.h   |  14 +-
 .../driver/xrt-lite/shim/linux/kmq/ert.h      |  25 +-
 runtime/src/iree-amd-aie/driver/xrt/api.h     |   2 +-
 .../iree-amd-aie/driver/xrt/nop_semaphore.cc  |   8 +-
 .../iree-amd-aie/driver/xrt/nop_semaphore.h   |   2 +-
 .../driver/xrt/registration/driver_module.h   |   2 +-
 .../src/iree-amd-aie/driver/xrt/xrt_buffer.h  |   2 +-
 .../src/iree-amd-aie/driver/xrt/xrt_device.cc |   8 +-
 .../schemas/pdi_executable_def.fbs            |   4 +-
 .../schemas/xrt_executable_def.fbs            |  22 +-
 134 files changed, 1038 insertions(+), 953 deletions(-)
 delete mode 100644 .github/workflows/black.yml
 create mode 100644 .github/workflows/lint.yml
 create mode 100644 .pre-commit-config.yaml

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 138afc058..155f57d8a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -24,4 +24,3 @@
 # Target emission
 /compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETarget* @makslevental
 /compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen* @makslevental @nirvedhmeshram @newling
-
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
deleted file mode 100644
index 1d4872e71..000000000
--- a/.github/workflows/black.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: Lint
-
-on: [push, pull_request]
-
-jobs:
-  lint:
-    runs-on: nod-ai-shared-cpubuilder-manylinux-x86_64
-    steps:
-      - uses: actions/checkout@v4
-      - uses: psf/black@stable
-
-# see https://black.readthedocs.io/en/stable/integrations/github_actions.html
diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml
index c94ac12eb..d319e0faf 100644
--- a/.github/workflows/ci-windows.yml
+++ b/.github/workflows/ci-windows.yml
@@ -49,7 +49,7 @@ jobs:
           timezoneLinux: "Asia/Singapore"
           timezoneMacos: "Asia/Singapore"
           timezoneWindows: "Singapore Standard Time"
-          
+
       - name: "Checking out repository"
         env:
           BRANCH_NAME: ${{ github.ref }}
@@ -107,7 +107,7 @@ jobs:
       - name: Create artifacts
         if: ${{ !cancelled() }}
         run: |
-          pushd third_party/iree/third_party/llvm-project 
+          pushd third_party/iree/third_party/llvm-project
           $llvm_sha_short = "$(git rev-parse --short HEAD)"
           popd
           tar cf llvm-dist-windows-$llvm_sha_short.tar llvm-install
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 000000000..40773c203
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,29 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# History in iree-amd-aie: copied from IREE January 2025.
+# To apply fixes locally, something like the following should work
+# > pip install pre-commit
+# > pre-commit run --all-files
+# Fixes of tabs (replace with space) must be done manually.
+
+name: Lint
+
+on: [pull_request]
+
+permissions:
+  contents: read
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Checking out repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Setting up python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+      - name: Running pre-commit
+        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
diff --git a/.gitignore b/.gitignore
index ad03ac70c..478d04ba7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 # This file was copied from IREE #
 ##################################
 
-# iree-amd-aie specific (CI or otherwise). 
+# iree-amd-aie specific (CI or otherwise).
 *-build/
 *-install/
 kernel-doc
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..e8d68bccc
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,54 @@
+# Pre-commit (https://pre-commit.com) configuration for assorted lint checks.
+#
+# See https://pre-commit.com/hooks.html for more hooks.
+#
+# History in iree-amd-aie. January 2025: copied from IREE and trimmed down to
+# the subset of hooks that are relevant.
+
+exclude: "third_party/"
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+      - id: check-merge-conflict
+
+      - id: check-yaml
+        # * Extensions can't be included in the mkdocs schema, so skip checking
+        #   https://github.com/squidfunk/mkdocs-material/issues/6378
+        # * clang-format files use `---` to split for multiple languages,
+        #   resulting in errors like `expected a single document in the stream`
+        exclude: "mkdocs.yml|.clang-format"
+
+      - id: end-of-file-fixer
+        exclude_types: ["image", "jupyter"]
+
+      - id: trailing-whitespace
+        exclude_types: ["image", "jupyter"]
+
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+        name: Run Black to format Python files
+
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    # Loosely track the most recent versions in
+    #   * Runner images: https://github.com/actions/runner-images/
+    #   * Editor extensions: https://github.com/microsoft/vscode-cpptools
+    rev: v18.1.3
+    hooks:
+      - id: clang-format
+        name: Run clang-format on C/C++/etc. files
+        exclude_types: ["jupyter"]
+
+  - repo: https://github.com/Lucas-C/pre-commit-hooks
+    rev: v1.5.5
+    hooks:
+      - id: forbid-tabs
+        exclude: ".gitmodules|Makefile"
+
+  - repo: https://github.com/jlebar/pre-commit-hooks.git
+    rev: f2d115a052860b09b2888b4f104be614bf3b4779
+    hooks:
+      - id: do-not-submit
diff --git a/LICENSE b/LICENSE
index f9dc50615..bd8b243df 100644
--- a/LICENSE
+++ b/LICENSE
@@ -216,4 +216,3 @@ conflicts with the conditions of the GPLv2, you may retroactively and
 prospectively choose to deem waived or otherwise exclude such Section(s) of
 the License, but only in their entirety and only with respect to the Combined
 Software.
-
diff --git a/README.md b/README.md
index e03ef17e1..52d3fc83e 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ git \
   git@github.com:nod-ai/iree-amd-aie.git # https://github.com/nod-ai/iree-amd-aie.git
 ```
 
-The above avoids cloning entire repo histories for submodules, and skips a few, currently, unused, 
+The above avoids cloning entire repo histories for submodules, and skips a few, currently, unused,
 submodules that are nested in IREE.
 
 ### Dependencies
@@ -78,7 +78,7 @@ cmake --build <WHERE_YOU_WOULD_LIKE_TO_BUILD>
 
 ### Instructions
 
-The bare minimum configure command for IREE with the amd-aie plugin 
+The bare minimum configure command for IREE with the amd-aie plugin
 
 ```
 cmake \
@@ -106,7 +106,7 @@ you can opt-out of everything (except the `llvm-cpu` backend) with
   -DIREE_INPUT_TOSA=OFF \
   -DIREE_HAL_DRIVER_DEFAULTS=OFF \
   -DIREE_TARGET_BACKEND_DEFAULTS=OFF \
-  -DIREE_TARGET_BACKEND_LLVM_CPU=ON 
+  -DIREE_TARGET_BACKEND_LLVM_CPU=ON
 ```
 
 With the above you can also skip cloning the `stablehlo` and `torch-mlir` submodules/repos but in this case you will need to add
@@ -121,14 +121,14 @@ If you're "bringing your own LLVM", i.e., you have a prebuilt/compiled distribut
   -DIREE_BUILD_BUNDLED_LLVM=OFF
 ```
 
-In this case you will need `lit` somewhere in your environment and you will need to add to CMake `-DLLVM_EXTERNAL_LIT=<SOMEWHERE>` 
+In this case you will need `lit` somewhere in your environment and you will need to add to CMake `-DLLVM_EXTERNAL_LIT=<SOMEWHERE>`
 (e.g., `pip install lit; SOMEWHERE=$(which lit)`).
 
 See [Bringing your own LLVM](#bringing-your-own-llvm) below for more information on using prebuilt/compiled distributions of LLVM.
 
 ## Testing
 
-Lit tests (i.e., compiler tests) specific to AIE can be run with something like 
+Lit tests (i.e., compiler tests) specific to AIE can be run with something like
 
 ```
 cd <WHERE_YOU_WOULD_LIKE_TO_BUILD>
@@ -137,7 +137,7 @@ ctest -R amd-aie --output-on-failure -j 10
 
 (the `-j 10` runs `10` tests in parallel)
 
-Other tests, which run on device, are in the `build_tools` subdirectory. 
+Other tests, which run on device, are in the `build_tools` subdirectory.
 See [build_tools/ci/run_all_runtime_tests.sh](build_tools/ci/run_all_runtime_tests.sh) for an example script that shows how to run all the runtime tests.
 
 ## Pro-tips
@@ -181,4 +181,3 @@ Note, this is roughly equivalent to [passing](https://github.com/nod-ai/iree-amd
 ## Architectural overview (out of date)
 
 ![image](https://github.com/nod-ai/iree-amd-aie/assets/74956/3fa73139-5fdf-4658-86c3-0705352c4ea0)
-
diff --git a/build_tools/ci/cpu_comparison/convolution_template/convolution_generator.py b/build_tools/ci/cpu_comparison/convolution_template/convolution_generator.py
index 2c7c16359..1ad743ce0 100644
--- a/build_tools/ci/cpu_comparison/convolution_template/convolution_generator.py
+++ b/build_tools/ci/cpu_comparison/convolution_template/convolution_generator.py
@@ -39,7 +39,6 @@ def get_output_type(self, N, OH, OW, OC, output_element_type):
 
 
 class ConvolutionMlirGenerator:
-
     def __init__(
         self,
         conv_type,
diff --git a/build_tools/ci/cpu_comparison/input_generator.py b/build_tools/ci/cpu_comparison/input_generator.py
index 09cc40801..131d71e9b 100644
--- a/build_tools/ci/cpu_comparison/input_generator.py
+++ b/build_tools/ci/cpu_comparison/input_generator.py
@@ -43,7 +43,6 @@ def bf16_to_f32(bfloat16_array):
 
 
 def generate_bfloat16_data(nb_values, lower_bound, upper_bound, rng):
-
     float_data = rng.integers(lower_bound, upper_bound, nb_values).astype(np.float32)
 
     # Convert float32 data to bfloat16
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 7e662161e..2117554de 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -95,7 +95,6 @@ def add_aie_compilation_flags(self, flags):
             self.aie_compilation_flags = list(set(self.aie_compilation_flags))
 
     def run(self, config):
-
         # If the target device is not in the set of devices to run on, then
         # return False. ie. don't raise an error because is legitimate,
         # we just won't run the test.
@@ -237,7 +236,6 @@ def __init__(
         self.function_name = function_name
 
     def vs_cpu(self, config):
-
         filename = self.get_filename(config)
 
         if self.use_ukernel and not config.vitis_dir:
@@ -256,7 +254,6 @@ def vs_cpu(self, config):
         return True
 
     def benchmark(self, config):
-
         filename = self.get_filename(config)
 
         if self.use_ukernel and not config.vitis_dir:
@@ -277,7 +274,6 @@ def benchmark(self, config):
         return True
 
     def generate(self, config, template_name):
-
         generate_matmul_test(
             self.get_filename(config),
             template_name,
@@ -890,7 +886,6 @@ def shell_out(cmd: list, workdir=None, verbose: int = 0, raise_on_error=True, en
 
 
 def print_program_memory_size(test_dir):
-
     # Get all the .elf files in `test_dir`.
     # These elfs contain many sections, one of which is the program memory. Some digging into the elf format
     # see https://github.com/newling/aie-rt/commit/d0f08bc4a37092a919d6a0d51a44d9f0ae274bb9
@@ -1567,7 +1562,6 @@ def aie_vs_llvm_cpu(
 
 
 class Tests:
-
     def add_aie_compilation_flags(self, flags):
         for test in self.tests:
             test.add_aie_compilation_flags(flags)
@@ -2182,7 +2176,6 @@ def all_tests(
     not_match = []
 
     for test in tests.tests:
-
         skip = test.name in skip_test_set or any(
             (label in skip_test_set for label in test.labels)
         )
diff --git a/build_tools/ci/reset_npu.sh b/build_tools/ci/reset_npu.sh
index b69f24837..231e7627c 100755
--- a/build_tools/ci/reset_npu.sh
+++ b/build_tools/ci/reset_npu.sh
@@ -10,4 +10,3 @@ set -e
 sudo modprobe -r amdxdna
 sudo modprobe drm_shmem_helper
 sudo modprobe amdxdna dyndbg==pflm timeout_in_sec=10
-
diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh
index 0f5d6f141..a1bdde8d0 100755
--- a/build_tools/ci/run_matmul_test.sh
+++ b/build_tools/ci/run_matmul_test.sh
@@ -200,7 +200,7 @@ function run_matmul_test() {
   # should still be checked to compile (set num_repeat_runs=0 in this case).
   local num_repeat_runs="1"
 
-  # Run the test 'num_corruption_repeat_runs' times without an NPU reset in 
+  # Run the test 'num_corruption_repeat_runs' times without an NPU reset in
   # between. This can be used to check for corruption, i.e. the AIE might be
   # left in a bad state in between runs. Additionally, this increases the speed
   # of the repeated test
@@ -570,7 +570,7 @@ run_matmul_test \
 # ObjectFifo Matmul tests
 ###################################################################
 
-# Run repeatedly to check for non-deterministic hangs and numerical 
+# Run repeatedly to check for non-deterministic hangs and numerical
 # issues.
 repeat_shapes=(
   '32x32x32'
diff --git a/build_tools/clang_llvm_tools_not_to_build.txt b/build_tools/clang_llvm_tools_not_to_build.txt
index dadf09de3..57bd98de3 100644
--- a/build_tools/clang_llvm_tools_not_to_build.txt
+++ b/build_tools/clang_llvm_tools_not_to_build.txt
@@ -110,4 +110,4 @@ CLANG_TOOL_CLANG_SCAN_DEPS
 CLANG_TOOL_CLANG_SHLIB
 CLANG_TOOL_DIAGTOOL
 CLANG_TOOL_LIBCLANG
-CLANG_TOOL_NVPTX_ARCH
\ No newline at end of file
+CLANG_TOOL_NVPTX_ARCH
diff --git a/cmake/iree_aie_utils.cmake b/cmake/iree_aie_utils.cmake
index aa4c57027..79d6c443b 100644
--- a/cmake/iree_aie_utils.cmake
+++ b/cmake/iree_aie_utils.cmake
@@ -42,4 +42,3 @@ function(replace_string_in_file _file _match_string _replace_string)
   file(WRITE "${_file}" "${_file_contents}")
   file(LOCK "${_lock_file}" RELEASE)
 endfunction()
-
diff --git a/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/PassDetail.h b/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/PassDetail.h
index f1bca9dbf..5e2c16cbc 100644
--- a/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/PassDetail.h
+++ b/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/PassDetail.h
@@ -13,7 +13,6 @@ namespace mlir::iree_compiler::XDNAOPLIB {
 #define GEN_PASS_DEF_XDNAOPLIBHELLOWORLD
 #include "xdna-oplib/Transforms/Passes.h.inc"
 
-}
+}  // namespace mlir::iree_compiler::XDNAOPLIB
 
-
-#endif // IREE_XDNA_OPLIB_TRANSFORMS_PASSDETAIL_H_
+#endif  // IREE_XDNA_OPLIB_TRANSFORMS_PASSDETAIL_H_
diff --git a/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/Passes.cpp b/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/Passes.cpp
index e42d7883a..a4159daf8 100644
--- a/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/Passes.cpp
+++ b/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/Passes.cpp
@@ -4,9 +4,10 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "mlir/Pass/PassManager.h"
 #include "xdna-oplib/Transforms/Passes.h"
 
+#include "mlir/Pass/PassManager.h"
+
 namespace mlir::iree_compiler::XDNAOPLIB {
 
 void addXDNAOPLIBPreprocessingExtensions(OpPassManager &pm) {
diff --git a/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/Passes.h b/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/Passes.h
index 12a555336..6303874f3 100644
--- a/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/Passes.h
+++ b/compiler/plugins/preprocessing/XDNA-OPLIB/xdna-oplib/Transforms/Passes.h
@@ -15,12 +15,11 @@ namespace mlir::iree_compiler::XDNAOPLIB {
 void addXDNAOPLIBPreprocessingExtensions(OpPassManager &pm);
 
 // Hello world pass to show the XDNA OpLib is functional
-std::unique_ptr<OperationPass<>>
-createXDNAOPLIBHelloWorldPass();
+std::unique_ptr<OperationPass<>> createXDNAOPLIBHelloWorldPass();
 
 // Registration for all XDNA OpLib passes.
 void registerXDNAOPLIBPasses();
 
-}
+}  // namespace mlir::iree_compiler::XDNAOPLIB
 
-#endif // IREE_XDNA_OPLIB_TRANSFORMS_PASSES_H_
+#endif  // IREE_XDNA_OPLIB_TRANSFORMS_PASSES_H_
diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEDialect.cpp b/compiler/plugins/target/AMD-AIE/aie/AIEDialect.cpp
index 42d8b865c..a9dbf0260 100644
--- a/compiler/plugins/target/AMD-AIE/aie/AIEDialect.cpp
+++ b/compiler/plugins/target/AMD-AIE/aie/AIEDialect.cpp
@@ -489,10 +489,9 @@ LogicalResult DMABDOp::verify() {
                << "Invalid step size; must be a positive integer.";
       }
       if (dim.getStride() > bufferType.getNumElements()) {
-        return emitOpError()
-               << "Step size " << std::to_string(dim.getStride()) << " "
-               << "exceeds memref size "
-               << std::to_string(bufferType.getNumElements());
+        return emitOpError() << "Step size " << std::to_string(dim.getStride())
+                             << " " << "exceeds memref size "
+                             << std::to_string(bufferType.getNumElements());
       }
       if (dim.getSize() >= ((1UL << 10) - 1)) {
         return emitOpError() << "Size may not exceed " << ((1UL << 10) - 1);
diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEDialect.h b/compiler/plugins/target/AMD-AIE/aie/AIEDialect.h
index b6e18a508..efe8abeb0 100644
--- a/compiler/plugins/target/AMD-AIE/aie/AIEDialect.h
+++ b/compiler/plugins/target/AMD-AIE/aie/AIEDialect.h
@@ -105,9 +105,9 @@ void printObjectFifoProducerTile(mlir::OpAsmPrinter &printer,
     llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &tiles,
     BDDimLayoutArrayArrayAttr &dimensions);
 
-[[maybe_unused]] void printObjectFifoConsumerTiles(mlir::OpAsmPrinter &printer,
-                                  mlir::Operation *op, mlir::OperandRange tiles,
-                                  BDDimLayoutArrayArrayAttr dimsPerTileAttr);
+[[maybe_unused]] void printObjectFifoConsumerTiles(
+    mlir::OpAsmPrinter &printer, mlir::Operation *op, mlir::OperandRange tiles,
+    BDDimLayoutArrayArrayAttr dimsPerTileAttr);
 
 TileOp getTileOp(mlir::Operation &op);
 int32_t getBufferElementTypeWidthInBytes(DMABDOp &op);
diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEEnums.h b/compiler/plugins/target/AMD-AIE/aie/AIEEnums.h
index b1628d732..6280ecf84 100644
--- a/compiler/plugins/target/AMD-AIE/aie/AIEEnums.h
+++ b/compiler/plugins/target/AMD-AIE/aie/AIEEnums.h
@@ -10,6 +10,8 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 
+// clang-format off: must include AIEEnums.h.inc after the above includes
 #include "aie/AIEEnums.h.inc"
+// clang-format on
 
 #endif
diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEOps.td b/compiler/plugins/target/AMD-AIE/aie/AIEOps.td
index 9b8691810..698afe7b6 100644
--- a/compiler/plugins/target/AMD-AIE/aie/AIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/aie/AIEOps.td
@@ -175,7 +175,7 @@ def AIE_FlowOp: AIE_Op<"flow"> {
   }];
   let builders = [
     OpBuilder<(
-      ins "::mlir::Value":$source, 
+      ins "::mlir::Value":$source,
           "::mlir::iree_compiler::AMDAIE::StrmSwPortType":$source_bundle,
           "uint8_t":$source_channel,
           "::mlir::Value":$dest,
diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIENormalizeAddressSpaces.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIENormalizeAddressSpaces.cpp
index 6d9b6b030..eaebb81eb 100644
--- a/compiler/plugins/target/AMD-AIE/aie/AMDAIENormalizeAddressSpaces.cpp
+++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIENormalizeAddressSpaces.cpp
@@ -99,4 +99,4 @@ void registerAMDAIENormalizeAddressSpaces() {
   });
 }
 
-}  // namespace mlir::iree_compiler::AMDAIE
\ No newline at end of file
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aie2_memtile_connection.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aie2_memtile_connection.mlir
index c5368e606..8106370c1 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/aie2_memtile_connection.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/aie2_memtile_connection.mlir
@@ -35,9 +35,9 @@ module {
   %tile_0_0 = aie.tile(0, 0)
   %tile_0_1 = aie.tile(0, 1)
   %tile_0_2 = aie.tile(0, 2)
-  aie.flow(%tile_0_1, DMA : 0, %tile_0_0, DMA : 0) 
-  aie.packet_flow(0) { 
-    aie.packet_source<%tile_0_2, DMA : 0> 
+  aie.flow(%tile_0_1, DMA : 0, %tile_0_0, DMA : 0)
+  aie.packet_flow(0) {
+    aie.packet_source<%tile_0_2, DMA : 0>
     aie.packet_dest<%tile_0_0, DMA : 1>
   }
  }
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/local_locks.mlir b/compiler/plugins/target/AMD-AIE/aie/test/local_locks.mlir
index 07082bf67..4aa3fa2e7 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/local_locks.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/local_locks.mlir
@@ -22,4 +22,3 @@ module @local_locks {
   }
  }
 }
-
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/normalize_call_op.mlir b/compiler/plugins/target/AMD-AIE/aie/test/normalize_call_op.mlir
index 53d74cb85..560856e70 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/normalize_call_op.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/normalize_call_op.mlir
@@ -19,4 +19,4 @@ module @aie attributes {llvm.target_triple = "aie"} {
   }
   func.func private @external_function(memref<1024xi32, 2>)
  }
-}
\ No newline at end of file
+}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/test_congestion0.mlir b/compiler/plugins/target/AMD-AIE/aie/test/test_congestion0.mlir
index 0651e04db..f3ef0e2e0 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/test_congestion0.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/test_congestion0.mlir
@@ -111,20 +111,20 @@ module {
     %tile_0_4 = aie.tile(0, 4)
     %tile_0_5 = aie.tile(0, 5)
 
-    aie.packet_flow(0) { 
-      aie.packet_source<%tile_0_2, DMA : 0> 
+    aie.packet_flow(0) {
+      aie.packet_source<%tile_0_2, DMA : 0>
       aie.packet_dest<%tile_0_1, DMA : 0>
     }
-    aie.packet_flow(1) { 
-      aie.packet_source<%tile_0_3, DMA : 0> 
+    aie.packet_flow(1) {
+      aie.packet_source<%tile_0_3, DMA : 0>
       aie.packet_dest<%tile_0_1, DMA : 1>
     }
-    aie.packet_flow(2) { 
-      aie.packet_source<%tile_0_4, DMA : 0> 
+    aie.packet_flow(2) {
+      aie.packet_source<%tile_0_4, DMA : 0>
       aie.packet_dest<%tile_0_1, DMA : 2>
     }
-    aie.packet_flow(3) { 
-      aie.packet_source<%tile_0_5, DMA : 0> 
+    aie.packet_flow(3) {
+      aie.packet_source<%tile_0_5, DMA : 0>
       aie.packet_dest<%tile_0_1, DMA : 3>
     }
   }
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/test_create_packet_flows_shim0.mlir b/compiler/plugins/target/AMD-AIE/aie/test/test_create_packet_flows_shim0.mlir
index 30d0e7e91..4ea4e4f6e 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/test_create_packet_flows_shim0.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/test_create_packet_flows_shim0.mlir
@@ -22,7 +22,7 @@
 // CHECK:   }
 
 //
-// one-to-one shim DMA destination 
+// one-to-one shim DMA destination
 //
 module @aie_module  {
  aie.device(xcvc1902) {
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/test_create_packet_flows_shim1.mlir b/compiler/plugins/target/AMD-AIE/aie/test/test_create_packet_flows_shim1.mlir
index 93b643fa4..8aca7081f 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/test_create_packet_flows_shim1.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/test_create_packet_flows_shim1.mlir
@@ -22,7 +22,7 @@
 // CHECK:   }
 
 //
-// one-to-one shim DMA source 
+// one-to-one shim DMA source
 //
 module @aie_module  {
  aie.device(xcvc1902) {
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/trace_packet_routing.mlir b/compiler/plugins/target/AMD-AIE/aie/test/trace_packet_routing.mlir
index 76d55e212..e49083311 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/trace_packet_routing.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/trace_packet_routing.mlir
@@ -1,7 +1,7 @@
 // RUN: iree-opt --amdaie-create-pathfinder-flows %s | FileCheck %s
 
 // CHECK-LABEL: module @trace_packet_routing {
-  
+
 module @trace_packet_routing {
  aie.device(npu1_4col) {
   %tile_0_0 = aie.tile(0, 0)
@@ -9,11 +9,11 @@ module @trace_packet_routing {
   %tile_0_2 = aie.tile(0, 2)
   %tile_0_3 = aie.tile(0, 3)
 
-  aie.packet_flow(0) { 
+  aie.packet_flow(0) {
     aie.packet_source<%tile_0_2, TRACE : 0> // core trace
     aie.packet_dest<%tile_0_0, DMA : 1>
   } {keep_pkt_header = true}
-  aie.packet_flow(1) { 
+  aie.packet_flow(1) {
     aie.packet_source<%tile_0_3, TRACE : 0> // core trace
     aie.packet_dest<%tile_1_0, DMA : 1>
   } {keep_pkt_header = true}
diff --git a/compiler/plugins/target/AMD-AIE/aie/test/unit_simple_flows_shim.mlir b/compiler/plugins/target/AMD-AIE/aie/test/unit_simple_flows_shim.mlir
index 4d667eebf..9b4c77179 100644
--- a/compiler/plugins/target/AMD-AIE/aie/test/unit_simple_flows_shim.mlir
+++ b/compiler/plugins/target/AMD-AIE/aie/test/unit_simple_flows_shim.mlir
@@ -49,5 +49,3 @@ module {
     aie.flow(%t20, DMA : 0, %t30, DMA : 1)
   }
 }
-
-
diff --git a/compiler/plugins/target/AMD-AIE/aievec/AIEVecOps.cpp b/compiler/plugins/target/AMD-AIE/aievec/AIEVecOps.cpp
index d1d578626..7bde52a01 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/AIEVecOps.cpp
+++ b/compiler/plugins/target/AMD-AIE/aievec/AIEVecOps.cpp
@@ -605,7 +605,6 @@ ParseResult FMAElemOp::parse(OpAsmParser &parser, OperationState &result) {
   return parseMulFMAElemOp(parser, result, true);
 }
 
-
 //===----------------------------------------------------------------------===//
 // ExtOp
 //===----------------------------------------------------------------------===//
@@ -627,20 +626,21 @@ LogicalResult ExtOp::verify() {
   // Verify the types
   VectorType sourceType = llvm::dyn_cast<VectorType>(getSource().getType());
   VectorType resultType = llvm::dyn_cast<VectorType>(getResult().getType());
-  if (!sourceType || !resultType)
-    return emitError("requires vector type");
+  if (!sourceType || !resultType) return emitError("requires vector type");
 
   // Check the number of lanes
   unsigned sourceLanes = getVectorLaneSize(sourceType);
   unsigned resultLanes = getVectorLaneSize(resultType);
   // Source lanes must be greater than result lanes
   if (sourceLanes / resultLanes <= 1)
-    return emitError("lanes in source vector must be at least "
-                     "twice that of result vector");
+    return emitError(
+        "lanes in source vector must be at least "
+        "twice that of result vector");
   // Source lanes must be a multiple of result lanes
   if (sourceLanes % resultLanes != 0)
-    return emitError("lanes in result vector must be a multiple "
-                     "of source vector lanes");
+    return emitError(
+        "lanes in result vector must be a multiple "
+        "of source vector lanes");
 
   // Verify validity of index
   unsigned factor = sourceLanes / resultLanes;
@@ -663,8 +663,7 @@ ParseResult ExtOp::parse(OpAsmParser &parser, OperationState &result) {
   OpAsmParser::UnresolvedOperand source;
 
   // Parse the source vector
-  if (parser.parseOperand(source))
-    return failure();
+  if (parser.parseOperand(source)) return failure();
 
   // Parse all the attributes and types
   if (parser.parseOptionalAttrDict(result.attributes) ||
@@ -715,15 +714,13 @@ void ShiftOp::print(OpAsmPrinter &p) {
 LogicalResult ShiftOp::verify() {
   // Verify the types
   VectorType resultType = llvm::dyn_cast<VectorType>(getResult().getType());
-  if (!resultType)
-    return emitError("requires vector type");
+  if (!resultType) return emitError("requires vector type");
 
   // lhs, rhs and result must have the same type
   VectorType lhsType = llvm::dyn_cast<VectorType>(getLhs().getType());
   VectorType rhsType = llvm::dyn_cast<VectorType>(getRhs().getType());
 
-  if (!lhsType || !rhsType)
-    return emitError("requires vector type");
+  if (!lhsType || !rhsType) return emitError("requires vector type");
   if (lhsType != resultType || rhsType != resultType)
     return emitError("All vectors must have same type");
 
@@ -765,8 +762,7 @@ ParseResult ShiftOp::parse(OpAsmParser &parser, OperationState &result) {
   if (!lhsType || !rhsType || !resultType)
     return parser.emitError(typesLoc, "requires vector type");
 
-  if (!shiftType)
-    return parser.emitError(typesLoc, "requires integer type");
+  if (!shiftType) return parser.emitError(typesLoc, "requires integer type");
 
   // Populate the lhs vector, rhs vectors and shift in result
   if (parser.resolveOperand(lhs, lhsType, result.operands) ||
diff --git a/compiler/plugins/target/AMD-AIE/aievec/AIEVecToLLVM.cpp b/compiler/plugins/target/AMD-AIE/aievec/AIEVecToLLVM.cpp
index 226947f83..718777f12 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/AIEVecToLLVM.cpp
+++ b/compiler/plugins/target/AMD-AIE/aievec/AIEVecToLLVM.cpp
@@ -798,12 +798,12 @@ class ShuffleOpConversion
 };
 
 class ShiftOpConversion : public mlir::ConvertOpToLLVMPattern<aievec::ShiftOp> {
-public:
+ public:
   using ConvertOpToLLVMPattern<aievec::ShiftOp>::ConvertOpToLLVMPattern;
 
-  LogicalResult
-  matchAndRewrite(aievec::ShiftOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
+  LogicalResult matchAndRewrite(
+      aievec::ShiftOp op, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
 
     Value result = op.getResult();
@@ -856,12 +856,12 @@ class ShiftOpConversion : public mlir::ConvertOpToLLVMPattern<aievec::ShiftOp> {
 };
 
 class ExtOpConversion : public mlir::ConvertOpToLLVMPattern<aievec::ExtOp> {
-public:
+ public:
   using ConvertOpToLLVMPattern<aievec::ExtOp>::ConvertOpToLLVMPattern;
 
-  LogicalResult
-  matchAndRewrite(aievec::ExtOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
+  LogicalResult matchAndRewrite(
+      aievec::ExtOp op, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
 
     Value src = adaptor.getSource();
@@ -955,7 +955,6 @@ class ExtOpConversion : public mlir::ConvertOpToLLVMPattern<aievec::ExtOp> {
   }
 };
 
-
 void populateAIEVecToLLVMConversionPatterns(mlir::LLVMTypeConverter &converter,
                                             mlir::RewritePatternSet &patterns) {
   patterns.add<UPSOpConversion, SRSOpConversion, FoldAIECastOps,
diff --git a/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp b/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
index 4f2ee3d0b..20ca03763 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
+++ b/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
@@ -1175,8 +1175,7 @@ FailureOr<Value> getAlignedTransferRead(
     // aligned, and we just couldn't prove it.
     readOp.emitWarning() << "`transfer_read` doesn't have a vector with "
                          << shiftOperandBits / 2 << " or " << shiftOperandBits
-                         << " bits."
-                         << "This case is not currently handled.";
+                         << " bits." << "This case is not currently handled.";
     return readOp.getVector();
   }
 
diff --git a/compiler/plugins/target/AMD-AIE/aievec/XLLVMDialect.h b/compiler/plugins/target/AMD-AIE/aievec/XLLVMDialect.h
index bd6ad37b6..a5455ef1b 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/XLLVMDialect.h
+++ b/compiler/plugins/target/AMD-AIE/aievec/XLLVMDialect.h
@@ -38,8 +38,8 @@
 #include "mlir/Transforms/Mem2Reg.h"
 
 #define GET_OP_CLASSES
-#include "aievec/XLLVMDialect.h.inc"
 #include "aievec/XLLVMAIE2IntrOps.h.inc"
+#include "aievec/XLLVMDialect.h.inc"
 
 namespace llvm {
 
diff --git a/compiler/plugins/target/AMD-AIE/aievec/test/test-srs.mlir b/compiler/plugins/target/AMD-AIE/aievec/test/test-srs.mlir
index 575c8eb9f..faf075780 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/test/test-srs.mlir
+++ b/compiler/plugins/target/AMD-AIE/aievec/test/test-srs.mlir
@@ -15,12 +15,12 @@ func.func @v32i16_srs_v32i32(%arg0 : vector<32xi32>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[ARG0]] : vector<32xi32> to vector<16xi64>
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.I512.v32.acc32.srs"(
-// CHECK-SAME: [[BITCAST0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[BITCAST0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<16xi64>, i32, i32) -> vector<32xi16>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[ARG0]] : vector<32xi32> to vector<16xi64>
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.I512.v32.acc32.srs"(
-// CHECK-SAME: [[BITCAST1]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[BITCAST1]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<16xi64>, i32, i32) -> vector<32xi16>
 
 // -----
@@ -39,11 +39,11 @@ func.func @v16i32_srs_v16i64(%arg0 : vector<16xi64>) {
 // CHECK-NEXT: %[[SHIFT5:.*]] = arith.constant 5 : i32
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.I512.v16.acc64.srs"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<16xi64>, i32, i32) -> vector<16xi32>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.I512.v16.acc64.srs"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<16xi64>, i32, i32) -> vector<16xi32>
 
 // -----
@@ -63,12 +63,12 @@ func.func @v16i16_srs_v16i32(%arg0 : vector<16xi32>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[ARG0]] : vector<16xi32> to vector<8xi64>
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.I256.v16.acc32.srs"(
-// CHECK-SAME: [[BITCAST0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[BITCAST0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<8xi64>, i32, i32) -> vector<16xi16>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[ARG0]] : vector<16xi32> to vector<8xi64>
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.I256.v16.acc32.srs"(
-// CHECK-SAME: [[BITCAST1]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[BITCAST1]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<8xi64>, i32, i32) -> vector<16xi16>
 
 // -----
@@ -88,12 +88,12 @@ func.func @v32i8_srs_v32i32(%arg0 : vector<32xi32>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[ARG0]] : vector<32xi32> to vector<16xi64>
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.I256.v32.acc32.srs"(
-// CHECK-SAME: [[BITCAST0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[BITCAST0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<16xi64>, i32, i32) -> vector<32xi8>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[ARG0]] : vector<32xi32> to vector<16xi64>
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.I256.v32.acc32.srs"(
-// CHECK-SAME: [[BITCAST1]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[BITCAST1]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<16xi64>, i32, i32) -> vector<32xi8>
 
 // -----
@@ -112,11 +112,11 @@ func.func @v16i16_srs_v16i64(%arg0 : vector<16xi64>) {
 // CHECK-NEXT: %[[SHIFT5:.*]] = arith.constant 5 : i32
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.I256.v16.acc64.srs"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<16xi64>, i32, i32) -> vector<16xi16>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.I256.v16.acc64.srs"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<16xi64>, i32, i32) -> vector<16xi16>
 
 // -----
@@ -135,11 +135,11 @@ func.func @v8i32_srs_v8i64(%arg0 : vector<8xi64>) {
 // CHECK-NEXT: %[[SHIFT5:.*]] = arith.constant 5 : i32
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.I256.v8.acc64.srs"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<8xi64>, i32, i32) -> vector<8xi32>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.I256.v8.acc64.srs"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<8xi64>, i32, i32) -> vector<8xi32>
 
 // -----
@@ -158,11 +158,11 @@ func.func @v16bf16_srs_v16f32(%arg0 : vector<16xf32>) {
 // CHECK-NEXT: %[[SHIFT5:.*]] = arith.constant 5 : i32
 // CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[ARG0]] : vector<16xf32> to vector<8xi64>
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.v16accfloat.to.v16bf16"(
-// CHECK-SAME: [[BITCAST0]]) : 
+// CHECK-SAME: [[BITCAST0]]) :
 // CHECK-SAME: (vector<8xi64>) -> vector<16xbf16>
 // CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[ARG0]] : vector<16xf32> to vector<8xi64>
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.v16accfloat.to.v16bf16"(
-// CHECK-SAME: [[BITCAST1]]) : 
+// CHECK-SAME: [[BITCAST1]]) :
 // CHECK-SAME: (vector<8xi64>) -> vector<16xbf16>
 
 // -----
diff --git a/compiler/plugins/target/AMD-AIE/aievec/test/test-ups.mlir b/compiler/plugins/target/AMD-AIE/aievec/test/test-ups.mlir
index 98b2b7b9f..119e5dfcc 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/test/test-ups.mlir
+++ b/compiler/plugins/target/AMD-AIE/aievec/test/test-ups.mlir
@@ -3,7 +3,7 @@
 func.func @v16i32_ups_v16i16(%arg0 : vector<16xi16>) {
   %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<16xi16>, vector<16xi32>
   %1 = aievec.ups %arg0 {shift = 5 : i8} : vector<16xi16>, vector<16xi32>
-  return 
+  return
 }
 
 // CHECK-LABEL: @v16i32_ups_v16i16
@@ -11,13 +11,13 @@ func.func @v16i32_ups_v16i16(%arg0 : vector<16xi16>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT0:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.acc32.v16.I256.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<16xi16>, i32, i32) -> vector<8xi64>
 // CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[SRS0]] : vector<8xi64> to vector<16xi32>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT5:.*]] = llvm.mlir.constant(5 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.acc32.v16.I256.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<16xi16>, i32, i32) -> vector<8xi64>
 // CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[SRS1]] : vector<8xi64> to vector<16xi32>
 
@@ -26,7 +26,7 @@ func.func @v16i32_ups_v16i16(%arg0 : vector<16xi16>) {
 func.func @v8acc64_ups_v8i32(%arg0 : vector<8xi32>) {
   %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<8xi32>, vector<8xi64>
   %1 = aievec.ups %arg0 {shift = 5 : i8} : vector<8xi32>, vector<8xi64>
-  return 
+  return
 }
 
 // CHECK-LABEL: @v8acc64_ups_v8i32
@@ -34,12 +34,12 @@ func.func @v8acc64_ups_v8i32(%arg0 : vector<8xi32>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT0:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.acc64.v8.I256.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<8xi32>, i32, i32) -> vector<8xi64>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT5:.*]] = llvm.mlir.constant(5 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.acc64.v8.I256.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<8xi32>, i32, i32) -> vector<8xi64>
 
 // -----
@@ -47,7 +47,7 @@ func.func @v8acc64_ups_v8i32(%arg0 : vector<8xi32>) {
 func.func @v32i32_ups_v32i16(%arg0 : vector<32xi16>) {
   %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<32xi16>, vector<32xi32>
   %1 = aievec.ups %arg0 {shift = 5 : i8} : vector<32xi16>, vector<32xi32>
-  return 
+  return
 }
 
 // CHECK-LABEL: @v32i32_ups_v32i16
@@ -55,13 +55,13 @@ func.func @v32i32_ups_v32i16(%arg0 : vector<32xi16>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT0:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.acc32.v32.I512.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<32xi16>, i32, i32) -> vector<16xi64>
 // CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[SRS0]] : vector<16xi64> to vector<32xi32>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT5:.*]] = llvm.mlir.constant(5 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.acc32.v32.I512.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<32xi16>, i32, i32) -> vector<16xi64>
 // CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[SRS1]] : vector<16xi64> to vector<32xi32>
 
@@ -70,7 +70,7 @@ func.func @v32i32_ups_v32i16(%arg0 : vector<32xi16>) {
 func.func @v16acc64_ups_v16i32(%arg0 : vector<16xi32>) {
   %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<16xi32>, vector<16xi64>
   %1 = aievec.ups %arg0 {shift = 5 : i8} : vector<16xi32>, vector<16xi64>
-  return 
+  return
 }
 
 // CHECK-LABEL: @v16acc64_ups_v16i32
@@ -78,12 +78,12 @@ func.func @v16acc64_ups_v16i32(%arg0 : vector<16xi32>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT0:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.acc64.v16.I512.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<16xi32>, i32, i32) -> vector<16xi64>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT5:.*]] = llvm.mlir.constant(5 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.acc64.v16.I512.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<16xi32>, i32, i32) -> vector<16xi64>
 
 // -----
@@ -91,7 +91,7 @@ func.func @v16acc64_ups_v16i32(%arg0 : vector<16xi32>) {
 func.func @v16acc64_ups_v16i16(%arg0 : vector<16xi16>) {
   %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<16xi16>, vector<16xi64>
   %1 = aievec.ups %arg0 {shift = 5 : i8} : vector<16xi16>, vector<16xi64>
-  return 
+  return
 }
 
 // CHECK-LABEL: @v16acc64_ups_v16i16
@@ -99,12 +99,12 @@ func.func @v16acc64_ups_v16i16(%arg0 : vector<16xi16>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT0:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.acc64.v16.I256.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<16xi16>, i32, i32) -> vector<16xi64>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT5:.*]] = llvm.mlir.constant(5 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.acc64.v16.I256.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<16xi16>, i32, i32) -> vector<16xi64>
 
 // -----
@@ -112,7 +112,7 @@ func.func @v16acc64_ups_v16i16(%arg0 : vector<16xi16>) {
 func.func @v32i32_ups_v32i8(%arg0 : vector<32xi8>) {
   %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<32xi8>, vector<32xi32>
   %1 = aievec.ups %arg0 {shift = 5 : i8} : vector<32xi8>, vector<32xi32>
-  return 
+  return
 }
 
 // CHECK-LABEL: @v32i32_ups_v32i8
@@ -120,39 +120,39 @@ func.func @v32i32_ups_v32i8(%arg0 : vector<32xi8>) {
 // CHECK-NEXT: %[[SIGN0:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT0:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.acc32.v32.I256.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT0]], %[[SIGN0]]) :
 // CHECK-SAME: (vector<32xi8>, i32, i32) -> vector<16xi64>
 // CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[SRS0]] : vector<16xi64> to vector<32xi32>
 // CHECK-NEXT: %[[SIGN1:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK-NEXT: %[[SHIFT5:.*]] = llvm.mlir.constant(5 : i32) : i32
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.acc32.v32.I256.ups"(
-// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) : 
+// CHECK-SAME: [[ARG0]], %[[SHIFT5]], %[[SIGN1]]) :
 // CHECK-SAME: (vector<32xi8>, i32, i32) -> vector<16xi64>
 // CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[SRS1]] : vector<16xi64> to vector<32xi32>
 
 // -----
 
 func.func @v16f32_ups_v16bf16(%arg0 : vector<16xbf16>) {
-  %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<16xbf16>, vector<16xf32> 
-  %1 = aievec.ups %arg0 {shift = 5 : i8} : vector<16xbf16>, vector<16xf32> 
+  %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<16xbf16>, vector<16xf32>
+  %1 = aievec.ups %arg0 {shift = 5 : i8} : vector<16xbf16>, vector<16xf32>
   return
 }
 
 // CHECK-LABEL: @v16f32_ups_v16bf16
 // CHECK-SAME: %[[ARG0:.*]]: vector<16xbf16>
 // CHECK-NEXT: %[[SRS0:.*]] = "xllvm.intr.aie2.v16bf16.to.v16accfloat"(
-// CHECK-SAME: [[ARG0]]) : 
+// CHECK-SAME: [[ARG0]]) :
 // CHECK-SAME: (vector<16xbf16>) -> vector<8xi64>
-// CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[SRS0]] : vector<8xi64> to vector<16xf32> 
+// CHECK-NEXT: %[[BITCAST0:.*]] = llvm.bitcast %[[SRS0]] : vector<8xi64> to vector<16xf32>
 // CHECK-NEXT: %[[SRS1:.*]] = "xllvm.intr.aie2.v16bf16.to.v16accfloat"(
-// CHECK-SAME: [[ARG0]]) : 
+// CHECK-SAME: [[ARG0]]) :
 // CHECK-SAME: (vector<16xbf16>) -> vector<8xi64>
-// CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[SRS1]] : vector<8xi64> to vector<16xf32> 
+// CHECK-NEXT: %[[BITCAST1:.*]] = llvm.bitcast %[[SRS1]] : vector<8xi64> to vector<16xf32>
 
 // -----
 
 func.func @v32f32_ups_v32bf16(%arg0 : vector<32xbf16>) {
-  %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<32xbf16>, vector<32xf32> 
+  %0 = aievec.ups %arg0 {shift = 0 : i8} : vector<32xbf16>, vector<32xf32>
   return
 }
 
diff --git a/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt
index 8dfe5a24c..7e283c021 100644
--- a/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt
@@ -215,7 +215,7 @@ replace_string_in_file(
   ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp
   "AIE::getTargetModel(*device)"
   "getDeviceModel(*device)")
-  
+
 replace_string_in_file(
   ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIESchedulingUtils.cpp
   "targetModel.hasProperty(AIE::AIETargetModel::UsesSemaphoreLocks)"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td
index 9b38f3bfa..c62bda857 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td
@@ -22,7 +22,7 @@ def AMDAIE_ConnectionType: I32EnumAttr<"ConnectionType",
   let genSpecializedAttr = 0;
 }
 
-def AMDAIE_ConnectionTypeAttr 
+def AMDAIE_ConnectionTypeAttr
   : EnumAttr<AMDAIE_Dialect, AMDAIE_ConnectionType, "connection_type">;
 
 def AMDAIE_CopyOpOperateOn: I32EnumAttr<"CopyOpOperateOn",
@@ -40,7 +40,7 @@ def AMDAIE_LockAction: I32EnumAttr<"LockAction",
   [
     I32EnumAttrCase<"Acquire", 0>,
     I32EnumAttrCase<"AcquireGreaterOrEqual", 1>,
-    I32EnumAttrCase<"Release", 2> 
+    I32EnumAttrCase<"Release", 2>
   ]
   > {
   let cppNamespace = "mlir::iree_compiler::AMDAIE";
@@ -88,7 +88,7 @@ def AMDAIE_PermLevelAttr :
   let parameters = (ins
     ArrayRefParameter<"int64_t","">:$perm
   );
-  
+
   let assemblyFormat = [{
     `[` $perm `]`
   }];
@@ -116,7 +116,7 @@ def AMDAIE_PackingConfigPackingLevelAttr :
     AttrParameter<"PermLevelsAttr",
         "Attributes for outer dimension permutation">:$outerPerm
   );
-  
+
   let assemblyFormat = [{
     `{` `packedSizes` `=` `[` $packedSizes `]` `,` `transposePackIndices` `=` `[` $transposePackIndices `]` `,` `unpackEmpty` `=` `[` $unpackEmpty `]` `,` `innerPerm` `=` $innerPerm `,` `outerPerm` `=` $outerPerm `}`
   }];
@@ -150,7 +150,7 @@ def AMDAIE_PackingConfigAttr :
     AttrParameter<"PackingConfigPackingLevelsAttr",
         "The packing config at different levels">:$packingLevels
   );
-  
+
   let extraClassDeclaration = [{
     // Returns the packing config for a level set for the op.
     PackingConfigPackingLevelAttr getPackingConfigVals(unsigned level);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDialect.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDialect.td
index 6a764cba4..27b3af71d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDialect.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDialect.td
@@ -41,7 +41,7 @@ def AMDAIE_Dialect : Dialect {
 // Trait to indicate that an operation is a circular DMA, meaning that the DMA
 // will be synchronized so that it will keep running its configuration
 // indefinitely.
-def AMDAIE_CircularDmaOp 
+def AMDAIE_CircularDmaOp
   : NativeOpTrait<"iree_compiler::AMDAIE::CircularDmaOp">;
 
 #endif // IREE_AMD_AIE_DIALECT_IREEAMDAIE_DIALECT
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.cpp
index e7ea90755..c33a1aff0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.cpp
@@ -4,10 +4,10 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <numeric>
-
 #include "iree-amd-aie/IR/AMDAIEDmaOpInterface.h"
 
+#include <numeric>
+
 #include "iree-amd-aie/IR/AMDAIEAttrs.h"
 
 /// Include the definitions of the dma-like interfaces.
@@ -71,7 +71,6 @@ std::optional<int64_t> getSourceStaticBaseOffset(DoublyStridedOpInterface op) {
   return getStaticBaseOffset<CopyOpOperateOn::Source>(op);
 }
 
-
 std::optional<int64_t> getSourceStaticSize(DoublyStridedOpInterface op) {
   return getStaticSize<CopyOpOperateOn::Source>(op);
 }
@@ -82,7 +81,6 @@ std::optional<int64_t> getTargetStaticBaseOffset(DoublyStridedOpInterface op) {
   return getStaticBaseOffset<CopyOpOperateOn::Target>(op);
 }
 
-
 std::optional<int64_t> getTargetStaticSize(DoublyStridedOpInterface op) {
   return getStaticSize<CopyOpOperateOn::Target>(op);
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.h
index 4cdc11242..42a215ddc 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.h
@@ -24,7 +24,6 @@ namespace detail {
 /// Otherwise, returns nullopt.
 std::optional<int64_t> getSourceStaticBaseOffset(DoublyStridedOpInterface op);
 
-
 /// Return the static size of the access on the source side if it can be
 /// computed. Otherwise, returns nullopt.
 std::optional<int64_t> getSourceStaticSize(DoublyStridedOpInterface op);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.td
index c5d70a718..655f8b01c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEDmaOpInterface.td
@@ -40,7 +40,7 @@ def DoublyStridedOpInterface : OpInterface<"DoublyStridedOpInterface"> {
 
     The invariants of this interface are:
       1. `offsets`, `sizes` and `strides` have the same length.
-      2. if an entry of static access pattern operands is equal to a special sentinel value, 
+      2. if an entry of static access pattern operands is equal to a special sentinel value,
          namely `ShapedType::kDynamic`, then the corresponding entry is a dynamic value.
       3. `offsets`, `sizes` and `strides` operands are non-negative.
   }];
@@ -67,7 +67,7 @@ def DoublyStridedOpInterface : OpInterface<"DoublyStridedOpInterface"> {
       /*defaultImplementation=*/[{
         std::optional<::mlir::Attribute> memSpace = getTargetMemorySpace();
         if (!memSpace) return std::nullopt;
-        return memSpace.value() ? 
+        return memSpace.value() ?
           cast<::mlir::IntegerAttr>(memSpace.value()).getInt() : 0;
       }]
     >,
@@ -221,7 +221,7 @@ def DoublyStridedOpInterface : OpInterface<"DoublyStridedOpInterface"> {
       /*defaultImplementation=*/[{
         std::optional<::mlir::Attribute> memSpace = getSourceMemorySpace();
         if (!memSpace) return std::nullopt;
-        return memSpace.value() ? 
+        return memSpace.value() ?
           cast<::mlir::IntegerAttr>(memSpace.value()).getInt() : 0;
       }]
     >,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td
index 15213b189..909b6db02 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td
@@ -100,7 +100,7 @@ def LogicalObjFifoOpInterface : OpInterface<"LogicalObjFifoOpInterface"> {
     >,
     InterfaceMethod<
       /*desc=*/[{
-        A utility to replace this logical objectFifo operation with a new one with new tiles. 
+        A utility to replace this logical objectFifo operation with a new one with new tiles.
       }],
       /*retTy=*/"::llvm::FailureOr<::mlir::iree_compiler::AMDAIE::LogicalObjFifoOpInterface>",
       /*methodName=*/"replaceWithNewTiles",
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index 17d2d604d..535b5e0a2 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -41,9 +41,9 @@ def AMDAIE_ControlCodeOp : AMDAIE_Op<"controlcode", [HasParent<"WorkgroupOp">,
     in a single block. Therefore, this operation is expected to always have a
     workgroup parent and is expected to be the terminator of this parent.
   }];
-  
+
   let regions = (region SizedRegion<1>:$region);
-  
+
   let assemblyFormat = [{ regions attr-dict }];
 
   let hasVerifier = 1;
@@ -70,9 +70,9 @@ def AMDAIE_CoreOp: AMDAIE_Op<"core", [SingleBlock, AttrSizedOperandSegments]>, R
   );
 
   let regions = (region SizedRegion<1>:$region);
-  
+
   let assemblyFormat = [{ `(` $tile `,` `in` `:` `[` $input_dmas `]` `,` `out` `:` `[` $output_dmas `]` `)` regions attr-dict }];
-  
+
   let builders = [
     OpBuilder<(ins "mlir::Value":$coreCol, "mlir::Value":$coreRow)>,
     OpBuilder<(ins "mlir::Value":$coreCol, "mlir::Value":$coreRow,
@@ -92,14 +92,14 @@ def AMDAIE_EndOp: AMDAIE_Op<"end", [Terminator]> {
   let assemblyFormat = [{ attr-dict }];
 }
 
-def AMDAIE_FlowOp: AMDAIE_Op<"flow", [AttrSizedOperandSegments]>, 
+def AMDAIE_FlowOp: AMDAIE_Op<"flow", [AttrSizedOperandSegments]>,
     Results<(outs Index)> {
   let summary = "The data connection between a set of source and target "
                 "channels.";
   let description = [{
-    This operation represents a connection between source and target channels. 
-    This is used to describe a logical data routing configuration between 
-    channels, to be solved by the router for actual stream switch 
+    This operation represents a connection between source and target channels.
+    This is used to describe a logical data routing configuration between
+    channels, to be solved by the router for actual stream switch
     configurations that implements it. The multiple sources and targets can
     describe different connection patterns:
     - Single source and multiple targets describes a data broadcasting pattern.
@@ -112,7 +112,7 @@ def AMDAIE_FlowOp: AMDAIE_Op<"flow", [AttrSizedOperandSegments]>,
     %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA)
     %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA)
     %channel_2 = amdaie.channel(%tile_1_1, 0, port_type = DMA)
-    %0 = amdaie.flow({%channel} -> {%channel_1, %channel_2}) 
+    %0 = amdaie.flow({%channel} -> {%channel_1, %channel_2})
       {is_packet_flow = true, packet_id = 0 : ui8}
     ```
   }];
@@ -176,12 +176,12 @@ def AMDAIE_WorkgroupOp : AMDAIE_Op<"workgroup",
     The invariant of this op is that the workgroup contains a single block, of
     which the terminator should be a `ControlCodeOp`.
   }];
-  
+
   let regions = (region SizedRegion<1>:$region);
   let arguments = (
     ins OptionalAttr<Builtin_DenseResourceElementsAttr>:$npu_instructions
   );
-  
+
   let assemblyFormat = [{ regions attr-dict }];
 
   // Skip the default builders and provide a custom implementation which ensures
@@ -202,7 +202,7 @@ def AMDAIE_WorkgroupOp : AMDAIE_Op<"workgroup",
     static void ensureTerminator(Region &region, OpBuilder &builder,
                                  Location loc);
   }];
-  
+
   let hasVerifier = 1;
 }
 
@@ -217,7 +217,7 @@ def AMDAIE_BufferOp: AMDAIE_Op<"buffer", [
   let description = [{
     This operation represents a buffer on an AIE tile. The buffer can have an
     optional address, indicating the location of the buffer on the tile.
-    
+
     Example:
 
     ```mlir
@@ -233,7 +233,7 @@ def AMDAIE_BufferOp: AMDAIE_Op<"buffer", [
 
   let results = (outs AnyMemRef:$buffer);
 
-  let assemblyFormat = [{ 
+  let assemblyFormat = [{
     `(` $tile (`,` $address^)? `)` attr-dict `:` type($buffer)
   }];
 }
@@ -244,8 +244,8 @@ def AMDAIE_LockOp: AMDAIE_Op<"lock", [
   ]>, Results<(outs Index)> {
   let summary = "Represents a physical lock on an AIE tile.";
   let description = [{
-    This operation represents a lock on an AIE tile. The operation is fully 
-    specified through a tile and lock ID value, designating the exact physical 
+    This operation represents a lock on an AIE tile. The operation is fully
+    specified through a tile and lock ID value, designating the exact physical
     lock to be used. This op helps with guaranteeing/verifying correct reuse of
     the same lock. The op accepts an optional initialization value.
 
@@ -256,7 +256,7 @@ def AMDAIE_LockOp: AMDAIE_Op<"lock", [
     %lock = amdaie.lock(%tile(0), 2)
     ```
 
-    This creates a lock on tile (0, 0) with ID 0 that should be initialized to 
+    This creates a lock on tile (0, 0) with ID 0 that should be initialized to
     2.
   }];
 
@@ -273,16 +273,16 @@ def AMDAIE_UseLockOp: AMDAIE_Op<"use_lock"> {
   let summary = "Represents the use of a semaphore lock with a specified "
                 "action (acquire/release).";
   let description = [{
-    This operation represents the use of a semaphore lock with a specified lock 
-    `action` and `value`. The lock action could for example be `Acquire`, 
-    `AcquireGreaterOrEqual` or `Release`. The specified `value` argument 
+    This operation represents the use of a semaphore lock with a specified lock
+    `action` and `value`. The lock action could for example be `Acquire`,
+    `AcquireGreaterOrEqual` or `Release`. The specified `value` argument
     determines the value to be used in the lock action, for example:
     - `Acquire(1)`: Acquire the lock if its value is equal to 1, then subtract 1
       from it.
-    - `AcquireGreaterOrEqual(1)`: Acquire the lock if its value is greater or 
+    - `AcquireGreaterOrEqual(1)`: Acquire the lock if its value is greater or
       equal to 1, then subtract 1 from it.
     - `Release(1)`: Add 1 to the value of this lock.
-    
+
 
     Example:
 
@@ -298,7 +298,7 @@ def AMDAIE_UseLockOp: AMDAIE_Op<"use_lock"> {
         I8Attr:$value
   );
 
-  let assemblyFormat = [{ 
+  let assemblyFormat = [{
     `(` $lock `,` $action `(` $value `)` `)` attr-dict
   }];
 }
@@ -374,11 +374,11 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [
     TileOp getTileOp();
   }];
 
-  let assemblyFormat = [{ 
-    `(` 
+  let assemblyFormat = [{
+    `(`
     $tile `,`
-    $value `,` 
-    `port_type` `=` $port_type `,` 
+    $value `,`
+    `port_type` `=` $port_type `,`
     `direction` `=` $direction
     `)`
     attr-dict
@@ -419,7 +419,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
   let description = [{
     The Npu DMA operation represents a strided copy operation with an unlimited
     number of dimensions, executed by the Npu uController. This operation refers
-    to a `ConnectionOp`, which will contain the necessary information about the 
+    to a `ConnectionOp`, which will contain the necessary information about the
     source and target logical objectFifos of the operation and which will
     instantiate the DMA connection.
 
@@ -440,7 +440,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
     Example:
 
     ```mlir
-    %2 = amdaie.connection(%1, %0) 
+    %2 = amdaie.connection(%1, %0)
       : (!amdaie.logicalobjectfifo<memref<32x64xi32, 1>>,
       !amdaie.logicalobjectfifo<memref<32x1024xi32>>)
     ...
@@ -473,7 +473,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
 
   let results = (outs Variadic<AMDAIE_AnyAsyncTokenType>:$async_tokens);
 
-  // Use a custom assembly format because of weird spaces being inserted around 
+  // Use a custom assembly format because of weird spaces being inserted around
   // the optional `target` by the default assembly format generator.
   let hasCustomAssemblyFormat = 1;
 
@@ -488,7 +488,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
       "ArrayRef<OpFoldResult>":$source_strides, "::mlir::Value":$source_bd_id)>,
     // Build a NpuDmaCpyNdOp with static entries.
     OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
-      "::mlir::Value":$target, "ArrayRef<int64_t>":$target_offsets, 
+      "::mlir::Value":$target, "ArrayRef<int64_t>":$target_offsets,
       "ArrayRef<int64_t>":$target_sizes, "ArrayRef<int64_t>":$target_strides,
       "::mlir::Value":$target_bd_id, "::mlir::Value":$source,
       "ArrayRef<int64_t>":$source_offsets, "ArrayRef<int64_t>":$source_sizes,
@@ -505,14 +505,14 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
   let extraClassDeclaration = [{
     // Check whether this dma operation has a wait user.
     bool hasDmaWaitOpUser();
-    
+
     // Check whether this operation has addressing on the source side.
     bool hasSourceAddressing() {
-      return !getSourceMixedOffsets().empty() || !getSourceMixedSizes().empty() 
+      return !getSourceMixedOffsets().empty() || !getSourceMixedSizes().empty()
         || !getSourceMixedStrides().empty();
     }
     // Check whether this operation has addressing on the target side.
-    bool hasTargetAddressing() { 
+    bool hasTargetAddressing() {
       return !getTargetMixedOffsets().empty() || !getTargetMixedSizes().empty()
         || !getTargetMixedStrides().empty();
     }
@@ -523,7 +523,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
 
     // Return the source memref type. This is retrieved using information from
     // the input DMA operation.
-    MemRefType getSourceMemrefType() { 
+    MemRefType getSourceMemrefType() {
       return cast<LogicalObjectFifoType>(getConnectionOp().getSourceType())
           .getElementType();
     }
@@ -536,7 +536,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
 
     // Return the target memref type. This is retrieved using information from
     // the input DMA operation.
-    MemRefType getTargetMemrefType() { 
+    MemRefType getTargetMemrefType() {
       return cast<LogicalObjectFifoType>(getConnectionOp().getTargetType())
           .getElementType();
     }
@@ -558,7 +558,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
       if (!bdIdValue) return nullptr;
       return dyn_cast_if_present<BdIdOp>(bdIdValue.getDefiningOp());
     }
-    
+
     FailureOr<AMDAIE::ChannelOp> getSourceChannelOp();
 
     FailureOr<AMDAIE::ChannelOp> getTargetChannelOp();
@@ -590,26 +590,26 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     operand provides information on how to use the connection, for example
     whether a packet header is needed.
 
-    The representation supports a partially-static representation for the 
-    `offsets`, `sizes` and `strides`. A special sentinel value 
+    The representation supports a partially-static representation for the
+    `offsets`, `sizes` and `strides`. A special sentinel value
     ShapedType::kDynamic encodes that the corresponding entry has a dynamic
     value.
 
-    It also supports the representation of DMA BD chaining using the, 
-    `next_bd`, and `start_bd` operands. The `next_bd` operand specifies 
+    It also supports the representation of DMA BD chaining using the
+    `next_bd` and `start_bd` operands. The `next_bd` operand specifies
     the BD ID of the next DMA operation in the chain, if there is any.
-    
+
     The `start_bd` operand specifies the BD ID of the first DMA operation in a sequence.
     - If `start_bd` is the same as `bd_id`, it marks the start of a chain.
-    - If `start_bd` differs from `bd_id` and `next_bd` is set, it represents 
+    - If `start_bd` differs from `bd_id` and `next_bd` is set, it represents
       an intermediate operation in the chain.
-    - If `start_bd` differs from `bd_id` and `next_bd` is not set, it represents 
+    - If `start_bd` differs from `bd_id` and `next_bd` is not set, it represents
       the end of the chain.
-      
+
     Example:
 
     ```mlir
-    %2 = amdaie.connection(%1, %0) 
+    %2 = amdaie.connection(%1, %0)
       : (!amdaie.logicalobjectfifo<memref<32x64xi32, 1>>,
       !amdaie.logicalobjectfifo<memref<32x1024xi32>>)
     %bd_id_0 = amdaie.bd_id(%tile_0_0, 0)
@@ -617,7 +617,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
     ...
     amdaie.controlcode {
-      %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} 
+      %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0}
         : memref<32x1024xi32> -> !amdaie.logicalobjectfifo<memref<32768xi32>>
       %4 = amdaie.npu.half_dma_cpy_nd async %2(%0[0, 0] [32, 64] [1024, 1]
         bd_id = %bd_id_0 channel = %channel next_bd = %bd_id_1 start_bd = %bd_id_0)
@@ -656,7 +656,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     (`next_bd` `=` $next_bd^)?
     (`start_bd` `=` $start_bd^)?
     `)`
-    attr-dict 
+    attr-dict
     `:` type($input)
   }];
 
@@ -684,7 +684,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     /// Return the number of leading operands before the `offsets`, `sizes` and
     /// and `strides` operands.
     static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 2; }
-    
+
     /// Return the expected rank of each of the`static_offsets`, `static_sizes`
     /// and `static_strides` attributes.
     std::array<unsigned, 3> getArrayAttrMaxRanks() {
@@ -718,7 +718,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp
 
     // Return the source memref type. This is retrieved using information from
     // the input DMA operation.
-    MemRefType getMemrefType() { 
+    MemRefType getMemrefType() {
       return cast<LogicalObjectFifoType>(getInput().getType())
           .getElementType();
     }
@@ -729,9 +729,9 @@ def AMDAIE_NpuHalfDmaCpyNdOp
         .getMemorySpace();
     }
 
-    // Helper method to return the memory space as an integer. If no memory 
+    // Helper method to return the memory space as an integer. If no memory
     // space attribute, this indicates a global memory space and 0 is returned.
-    // Else cast the memory space attribute to an integer. 
+    // Else cast the memory space attribute to an integer.
     uint8_t getMemorySpaceAsUInt() {
       Attribute memSpace = getMemorySpace();
       return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
@@ -742,13 +742,13 @@ def AMDAIE_NpuHalfDmaCpyNdOp
 
     // Compute and return the size of the DMA access if possible.
     std::optional<int64_t> getAccessStaticSize();
-    
+
     // Check whether this dma operation has a wait user.
     bool hasDmaWaitOpUser();
-    
+
     // Check whether this operation has addressing.
     bool hasAddressing() {
-      return !getMixedOffsets().empty() || !getMixedSizes().empty() 
+      return !getMixedOffsets().empty() || !getMixedSizes().empty()
         || !getMixedStrides().empty();
     }
   }];
@@ -761,8 +761,8 @@ def AMDAIE_NpuCircularDmaCpyNdOp: AMDAIE_Op<"npu.circular_dma_cpy_nd", [
   let description = [{
     The Npu circular DMA operation represents a strided copy operation with an
     unlimited number of dimensions that will go on indefinitely. This operation
-    will configure a connection to perform that copy operation. This operation 
-    refers to a `ConnectionOp` for this connection, which will contain the 
+    will configure a connection to perform that copy operation. This operation
+    refers to a `ConnectionOp` for this connection, which will contain the
     necessary information about the source and target logical objectFifos.
 
     The representation supports a partially-static representation of both the
@@ -828,23 +828,23 @@ def AMDAIE_NpuCircularDmaCpyNdOp: AMDAIE_Op<"npu.circular_dma_cpy_nd", [
     // Build a NpuCircularDmaCpyNdOp with dynamic entries.
     OpBuilder<(ins "Value":$connection, "ValueRange":$target_offsets,
       "ValueRange":$target_sizes, "ValueRange":$target_strides,
-      "ValueRange":$source_offsets, "ValueRange":$source_sizes, 
+      "ValueRange":$source_offsets, "ValueRange":$source_sizes,
       "ValueRange":$source_strides)>
   ];
 
   let extraClassDeclaration = [{
     // Check whether this operation has addressing on the source side.
     bool hasSourceAddressing() {
-      return !getSourceMixedOffsets().empty() || !getSourceMixedSizes().empty() 
+      return !getSourceMixedOffsets().empty() || !getSourceMixedSizes().empty()
         || !getSourceMixedStrides().empty();
     }
-    
+
     // Check whether this operation has addressing on the target side.
-    bool hasTargetAddressing() { 
+    bool hasTargetAddressing() {
       return !getTargetMixedOffsets().empty() || !getTargetMixedSizes().empty()
         || !getTargetMixedStrides().empty();
     }
-    
+
     // Return the input connection operation.
     ConnectionOp getConnectionOp() {
       return dyn_cast_if_present<ConnectionOp>(getConnection().getDefiningOp());
@@ -852,7 +852,7 @@ def AMDAIE_NpuCircularDmaCpyNdOp: AMDAIE_Op<"npu.circular_dma_cpy_nd", [
 
     // Return the source memref type. This is retrieved using information from
     // the input DMA operation.
-    MemRefType getSourceMemrefType() { 
+    MemRefType getSourceMemrefType() {
       return cast<LogicalObjectFifoType>(getConnectionOp().getSourceType())
           .getElementType();
     }
@@ -865,7 +865,7 @@ def AMDAIE_NpuCircularDmaCpyNdOp: AMDAIE_Op<"npu.circular_dma_cpy_nd", [
 
     // Return the target memref type. This is retrieved using information from
     // the input DMA operation.
-    MemRefType getTargetMemrefType() { 
+    MemRefType getTargetMemrefType() {
       return cast<LogicalObjectFifoType>(getConnectionOp().getTargetType())
           .getElementType();
     }
@@ -896,7 +896,7 @@ def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> {
   let summary = "Wait for the Npu DMA operation to complete.";
   let description = [{
     The wait operation will block on the referenced dependent ops.
-    
+
     If a dependent op returns a `!amdaie.async_token`, this wait op will block
     on the dependent op having completed execution.
     If a dependent op returns a `!amdaie.async_source_token`, this wait op will
@@ -919,7 +919,7 @@ def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> {
     ```
 
     Here, the `dma_wait` operation will wait until the referenced Npu DMA
-    operation has started execution. On the other hand, the 
+    operation has started execution. On the other hand, the
     `!amdaie.async_target_token` can be used to wait on the target side of the
     DMA, i.e. until the DMA has finished its write into the target memory:
 
@@ -1026,7 +1026,7 @@ def AMDAIE_NpuTctSyncOp: AMDAIE_Op<"npu.tct_sync"> {
   let summary = "Wait for the TCTs to be emitted.";
   let description = [{
     This NPU controller operation to synchronize the Task Completion Tokens (TCTs)
-    on the specified `channel` and `direction`. The ranges of tiles to synchronize 
+    on the specified `channel` and `direction`. The ranges of tiles to synchronize
     are defined by [col, col+col_num) and [row, row+row_num).
 
     Example:
@@ -1077,16 +1077,16 @@ def AMDAIE_ConnectionOp: AMDAIE_Op<"connection",
         Variadic<Index>:$source_channels,
         OptionalAttr<AMDAIE_ConnectionTypeAttr>:$connection_type,
         Optional<Index>:$flow
-        
+
   );
   let results = (outs Index:$result);
 
-  let assemblyFormat = [{ 
+  let assemblyFormat = [{
     `(`
     $target
-    ( ` ` `{` $target_channels^  `}` )? 
+    ( ` ` `{` $target_channels^  `}` )?
     `,`
-    $source 
+    $source
     ( ` ` `{` $source_channels^  `}` )?
     ( `,` `flow` `=` $flow^ )?
     `)`
@@ -1096,7 +1096,7 @@ def AMDAIE_ConnectionOp: AMDAIE_Op<"connection",
 
   let builders = [
     OpBuilder<(ins "Value":$target, "::mlir::Value":$source)>,
-    OpBuilder<(ins "Value":$target, "::mlir::ValueRange":$target_channels, 
+    OpBuilder<(ins "Value":$target, "::mlir::ValueRange":$target_channels,
       "::mlir::Value":$source, "::mlir::ValueRange":$source_channels)>
   ];
 
@@ -1128,7 +1128,7 @@ def AMDAIE_LogicalObjectFifoAccessOp : AMDAIE_Op<"logicalobjectfifo.access"> {
       %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<8x16xi32, 2>
         -> !amdaie.logicalobjectfifo<memref<8x16xi32, 2>>
       %core = amdaie.core(%tile, in : [], out : []) {
-        %1 = amdaie.logicalobjectfifo.access(%0, Read) : 
+        %1 = amdaie.logicalobjectfifo.access(%0, Read) :
           !amdaie.logicalobjectfifo<memref<8x16xi32, 2>> ->  memref<8x16xi32, 2>
     ```
   }];
@@ -1136,7 +1136,7 @@ def AMDAIE_LogicalObjectFifoAccessOp : AMDAIE_Op<"logicalobjectfifo.access"> {
   let arguments = (
     ins AnyAMDAIELogicalObjectFifoType:$input,
         MemoryAccess:$access_type
-  ); 
+  );
 
   let results = (outs AnyMemRef:$output);
 
@@ -1158,7 +1158,7 @@ def AMDAIE_LogicalObjectFifoAccessOp : AMDAIE_Op<"logicalobjectfifo.access"> {
   let cppNamespace = "mlir::iree_compiler::AMDAIE";
 }
 
-def AMDAIE_LogicalObjectFifoAcquire: 
+def AMDAIE_LogicalObjectFifoAcquire:
     AMDAIE_Op<"logicalobjectfifo.acquire", []> {
   let summary = "Semaphore operation to acquire objects from a logical"
                 "objectFifo DMA operation.";
@@ -1201,7 +1201,7 @@ def AMDAIE_LogicalObjectFifoAcquire:
 }
 
 def AMDAIE_LogicalObjectFifoFromBuffersOp
-    : AMDAIE_Op<"logicalobjectfifo.from_buffers", 
+    : AMDAIE_Op<"logicalobjectfifo.from_buffers",
     [DeclareOpInterfaceMethods<LogicalObjFifoOpInterface, ["replaceWithNewTiles"]>,
      Pure, AttrSizedOperandSegments]> {
   let summary = "Create a logical objectFifo from a set of buffers";
@@ -1218,7 +1218,7 @@ def AMDAIE_LogicalObjectFifoFromBuffersOp
       %lock = amdaie.lock(%tile(0), 2)
       %lock_1 = amdaie.lock(%tile(1), 0)
       %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1})
-      : memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32> 
+      : memref<1024xi32, 1 : i32>, memref<1024xi32, 1 : i32>
       -> !amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>, 2>
     ```
   }];
@@ -1227,12 +1227,12 @@ def AMDAIE_LogicalObjectFifoFromBuffersOp
     ins Variadic<AnyMemRef>:$buffers,
         Variadic<Index>:$producerLocks,
         Variadic<Index>:$consumerLocks
-  ); 
+  );
 
   let results = (outs AnyAMDAIELogicalObjectFifoType:$output);
 
   let assemblyFormat = [{
-    `(` `{` $buffers `}` `,` `{` $producerLocks `}` `,` `{` $consumerLocks `}` `)` 
+    `(` `{` $buffers `}` `,` `{` $producerLocks `}` `,` `{` $consumerLocks `}` `)`
     attr-dict `:` type($buffers) `->` type($output)
   }];
 
@@ -1252,18 +1252,18 @@ def AMDAIE_LogicalObjectFifoFromBuffersOp
 
     // Helper method to return the memory space as an integer. If no memory
     // space attribute, this indicates a global memory space and we return 0.
-    // Else cast the memory space attribute to an integer. 
+    // Else cast the memory space attribute to an integer.
     uint8_t getMemorySpaceAsUInt() {
       Attribute memSpace = getMemorySpace();
       return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
     }
 
     // Return the source memref type.
-    MemRefType getMemrefType() { 
+    MemRefType getMemrefType() {
       return cast<LogicalObjectFifoType>(getOutput().getType())
         .getElementType();
     }
-  
+
     // Return the encapsulated buffers on the requested tile.
     llvm::SmallVector<BufferOp> getBuffersOnTile(TileOp tileOp);
 
@@ -1279,8 +1279,8 @@ def AMDAIE_LogicalObjectFifoFromBuffersOp
 }
 
 def AMDAIE_LogicalObjectFifoFromMemrefOp
-    : AMDAIE_Op<"logicalobjectfifo.from_memref", 
-    [DeclareOpInterfaceMethods<LogicalObjFifoOpInterface, ["replaceWithNewTiles"]>, 
+    : AMDAIE_Op<"logicalobjectfifo.from_memref",
+    [DeclareOpInterfaceMethods<LogicalObjFifoOpInterface, ["replaceWithNewTiles"]>,
      Pure]> {
   let summary = "Create a logical objectFifo from a memref";
   let description = [{
@@ -1300,7 +1300,7 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp
   let arguments = (
     ins AnyMemRef:$memref,
         Variadic<Index>:$tiles
-  ); 
+  );
 
   let results = (outs AnyAMDAIELogicalObjectFifoType:$output);
 
@@ -1314,7 +1314,7 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp
     // Build `LogicalObjectFifoFromMemrefOp` with an array of static tile
     // locations.
     OpBuilder<
-      (ins "mlir::Value":$memref, 
+      (ins "mlir::Value":$memref,
            "::llvm::ArrayRef<std::pair<int64_t, int64_t>>":$tileLocations)>
   ];
 
@@ -1332,14 +1332,14 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp
 
     // Helper method to return the memory space as an integer. If no memory
     // space attribute, this indicates a global memory space and we return 0.
-    // Else cast the memory space attribute to an integer. 
+    // Else cast the memory space attribute to an integer.
     uint8_t getMemorySpaceAsUInt() {
       Attribute memSpace = getMemorySpace();
       return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
     }
 
     // Return the source memref type.
-    MemRefType getMemrefType() { 
+    MemRefType getMemrefType() {
       return cast<LogicalObjectFifoType>(getOutput().getType())
         .getElementType();
     }
@@ -1353,7 +1353,7 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp
   let cppNamespace = "mlir::iree_compiler::AMDAIE";
 }
 
-def AMDAIE_LogicalObjectFifoPlaceholderOp: 
+def AMDAIE_LogicalObjectFifoPlaceholderOp:
     AMDAIE_Op<"logicalobjectfifo.placeholder", [
       DeclareOpInterfaceMethods<LogicalObjFifoOpInterface, ["replaceWithNewTiles"]>,
       Pure]> {
@@ -1366,12 +1366,12 @@ def AMDAIE_LogicalObjectFifoPlaceholderOp:
 
     Example:
     ```mlir
-    %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) 
+    %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0)
       binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1024xi32>
     %alloc = memref.alloc() : memref<1024xi32, 1 : i32>
     %obj0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_1}
       : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>>
-    %ph = amdaie.logicalobjectfifo.placeholder{} 
+    %ph = amdaie.logicalobjectfifo.placeholder{}
       : !amdaie.logicalobjectfifo<memref<2048xi32>>
     %connection = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %ph[] [] [])
       : (!amdaie.logicalobjectfifo<memref<1024xi32, 1 : i32>>,
@@ -1379,8 +1379,8 @@ def AMDAIE_LogicalObjectFifoPlaceholderOp:
     amdaie.controlcode {
       %obj1 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0}
         : memref<1024xi32> -> !amdaie.logicalobjectfifo<memref<1024xi32>>
-      %npu_dma = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], 
-        %obj0[%c0, %c32] [%c32, %c32] [%c32, %c1]) 
+      %npu_dma = amdaie.npu.dma_cpy_nd async_source %connection([] [] [],
+        %obj0[%c0, %c32] [%c32, %c32] [%c32, %c1])
         : source_type = !amdaie.logicalobjectfifo<memref<1024xi32>>
       amdaie.end
     }
@@ -1407,21 +1407,21 @@ def AMDAIE_LogicalObjectFifoPlaceholderOp:
 
     // Helper method to return the memory space as an integer. If no memory
     // space attribute, this indicates a global memory space and we return 0.
-    // Else cast the memory space attribute to an integer. 
+    // Else cast the memory space attribute to an integer.
     uint8_t getMemorySpaceAsUInt() {
       Attribute memSpace = getMemorySpace();
       return memSpace ? cast<IntegerAttr>(memSpace).getInt() : 0;
     }
 
     // Return the source memref type.
-    MemRefType getMemrefType() { 
+    MemRefType getMemrefType() {
       return cast<LogicalObjectFifoType>(getOutput().getType())
         .getElementType();
     }
   }];
 }
 
-def AMDAIE_LogicalObjectFifoRelease: 
+def AMDAIE_LogicalObjectFifoRelease:
     AMDAIE_Op<"logicalobjectfifo.release", []> {
   let summary = "Semaphore operation to release objects from a logical"
                 "objectFifo DMA operation.";
@@ -1472,7 +1472,7 @@ class AMDAIE_DmaCpyNdBaseOp<string mnemonic, list<Trait> traits = []> :
     The representation supports a partially-static representation of both the source and
     target `offsets`, `sizes` and `strides`. A special sentinel value ShapedType::kDynamic
     encodes that the corresponding entry has a dynamic value.
-    
+
     Example:
 
     ```mlir
@@ -1550,7 +1550,7 @@ class AMDAIE_DmaCpyNdBaseOp<string mnemonic, list<Trait> traits = []> :
     std::optional<Attribute> getTargetMemorySpace() {
       return cast<LogicalObjectFifoType>(getTargetType()).getMemorySpace();
     }
-    
+
     // A utility to create a new doubly strided operation from this one with a
     // new set of source and target offsets, sizes and strides.
     DoublyStridedOpInterface createDoublyStridedOp(
@@ -1574,7 +1574,7 @@ def AMDAIE_DmaCpyNdOp: AMDAIE_DmaCpyNdBaseOp<"dma_cpy_nd", []> {
     which is similar, but keeps copying data indefinitely, waiting for data to be
     produced on the source logical objectFifo and producing into the target logical
     objectFifo.
-    
+
     Example:
 
     ```mlir
@@ -1596,10 +1596,10 @@ def AMDAIE_CircularDmaCpyNdOp: AMDAIE_DmaCpyNdBaseOp<"circular_dma_cpy_nd", [Pur
   let description = [{
     The DMA operation represents a strided copy operation with an unlimited number of
     dimensions from a source to a target logical objectFifo as specified by the
-    `DmaCpyNdBaseOp` base class. This operation keeps copying data indefinitely, waiting 
+    `DmaCpyNdBaseOp` base class. This operation keeps copying data indefinitely, waiting
     for data to be produced on the source logical objectFifo and producing into the target
     logical objectfifo.
-    
+
     Example:
 
     ```mlir
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.cpp
index c29d71c98..3b7654330 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.cpp
@@ -4,8 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree-amd-aie/IR/AMDAIEDialect.h"
 #include "iree-amd-aie/IR/AMDAIETypes.h"
+
+#include "iree-amd-aie/IR/AMDAIEDialect.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.h
index 5c84f98a3..70fe7b1e0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.h
@@ -7,14 +7,13 @@
 #ifndef IREE_COMPILER_AMDAIE_TYPES_H_
 #define IREE_COMPILER_AMDAIE_TYPES_H_
 
+#include "iree-amd-aie/IR/AMDAIEAttrs.h"
+#include "iree-amd-aie/IR/AMDAIEDmaOpInterface.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 
-#include "iree-amd-aie/IR/AMDAIEAttrs.h"
-#include "iree-amd-aie/IR/AMDAIEDmaOpInterface.h"
-
 // clang-format off
 #define GET_TYPEDEF_CLASSES
 #include "iree-amd-aie/IR/AMDAIETypes.h.inc"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.td
index 1ff011c0b..bc4eecad7 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.td
@@ -22,7 +22,7 @@ class AMDAIEDialect_Type<string name, string typeMnemonic, list<Trait> traits =
 def AMDAIE_AsyncTokenType : AMDAIEDialect_Type<"AsyncToken", "async_token">;
 def AMDAIE_AsyncSourceTokenType : AMDAIEDialect_Type<"AsyncSourceToken", "async_source_token">;
 def AMDAIE_AsyncTargetTokenType : AMDAIEDialect_Type<"AsyncTargetToken", "async_target_token">;
-def AMDAIE_AnyAsyncTokenType 
+def AMDAIE_AnyAsyncTokenType
   : AnyTypeOf<[AMDAIE_AsyncTokenType, AMDAIE_AsyncSourceTokenType, AMDAIE_AsyncTargetTokenType]>;
 
 def AMDAIE_LogicalObjectFifoType :
@@ -30,7 +30,7 @@ def AMDAIE_LogicalObjectFifoType :
   let summary = "The logical objectfifo type encapsulating a memref";
   let description = [{
     The logical objectfifo type encapulates a memref and provides synchronized
-    access operations to retrieve the underlying memref. This type is similar and 
+    access operations to retrieve the underlying memref. This type is similar to and
     based on the MLIR-AIE ObjectFifo type. For now, this type only works with
     static memrefs.
 
@@ -45,7 +45,7 @@ def AMDAIE_LogicalObjectFifoType :
   let genVerifyDecl = 1;
 
   let assemblyFormat = "`<` $element_type (`,` $depth^)? `>`";
-  
+
   let builders = [
     TypeBuilderWithInferredContext<(ins "MemRefType":$elementType), [{
       return $_get(elementType.getContext(), elementType, 1);
@@ -60,9 +60,9 @@ def AMDAIE_LogicalObjectFifoType :
       return getElementType().getMemorySpace();
     }
 
-    /// Helper method to return the memory space as an integer. If no memory 
-    /// space attribute exists, this indicates a global memory space and we 
-    /// return 0. Else we cast the memory space attribute to an integer. 
+    /// Helper method to return the memory space as an integer. If no memory
+    /// space attribute exists, this indicates a global memory space and we
+    /// return 0. Else we cast the memory space attribute to an integer.
     uint8_t getMemorySpaceAsUInt() {
       Attribute memSpace = getMemorySpace();
       return memSpace ? llvm::cast<IntegerAttr>(memSpace).getInt() : 0;
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
index 81ec50b26..24c150ede 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
@@ -229,7 +229,6 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend {
 
   void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr,
                                     OpPassManager &passManager) override {
-
     buildAMDAIETransformPassPipeline(
         passManager, options.AMDAIETargetDevice, options.AMDAIENumRows,
         options.AMDAIENumCols, options.useTilePipeline,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h
index 7f5f661e6..d2caf9863 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h
@@ -24,7 +24,7 @@ mlir::LogicalResult aie2xclbin(
     const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir,
     const std::optional<std::string> &InputXCLBin,
     const std::optional<std::string> &ukernel,
-    const std::string & additionalPeanoOptFlags);
+    const std::string &additionalPeanoOptFlags);
 
 mlir::LogicalResult emitNpuInstructions(xilinx::AIE::DeviceOp deviceOp,
                                         const std::string &outputNPU);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
index e61b79459..a090f73c5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
@@ -62,7 +62,7 @@ void matmul_vectorized_bf16_f32(const bfloat16 * __restrict pA, unsigned offsetA
   const unsigned size_B = L0_K * L0_N;
   const unsigned size_C = L0_M * L0_N;
   using MMUL = aie::detail::mmul_bfp16_bfp16<L0_M, L0_K, L0_N, bfp16ebs8, bfp16ebs8, 32>;
-  
+
   v32accfloat * restrict pOut = (v32accfloat *) (pC + offsetC);
 
   for (unsigned z = 0; z < rowA; z += 2)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/bd_chaining.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/bd_chaining.mlir
index 8e98f6e6a..1569c2939 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/bd_chaining.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/bd_chaining.mlir
@@ -20,12 +20,12 @@ aie.device(npu1_4col) {
   %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i8}
   %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i8}
   %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i8}
-  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<8x16xi32> 
-  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<16x32xi32> 
-  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<8x32xi32> 
-  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<2x2x4x8xi32> 
-  %buf1 = aie.buffer(%tile_0_2) {address = 1536 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<8x2x8x4xi32> 
-  %buf0 = aie.buffer(%tile_0_2) {address = 3584 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<8x2x4x4xi32> 
+  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<8x16xi32>
+  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<16x32xi32>
+  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<8x32xi32>
+  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<2x2x4x8xi32>
+  %buf1 = aie.buffer(%tile_0_2) {address = 1536 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<8x2x8x4xi32>
+  %buf0 = aie.buffer(%tile_0_2) {address = 3584 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<8x2x4x4xi32>
   %mem_0_2 = aie.mem(%tile_0_2) {
     %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
   ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -321,210 +321,210 @@ aie.device(npu1_4col) {
 // CHECK: XAIE API: XAie_EnableShimDmaToAieStrmPort with args: devInst=ptr, tileLoc=TileLoc(col: 0, row: 0), connect.dst.channel=3
 // CHECK: XAIE API: XAie_EnableShimDmaToAieStrmPort with args: devInst=ptr, tileLoc=TileLoc(col: 0, row: 0), connect.dst.channel=7
 // CHECK: XAIE API: XAie_EnableAieToShimDmaStrmPort with args: devInst=ptr, tileLoc=TileLoc(col: 0, row: 0), connect.src.channel=2
-// CHECK: cdo-driver: (NOP Command): Payload Length: 0 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 0 
-// CHECK: cdo-driver: (Write64): Address:  0x00000000021C0010 Data:  0x00000001  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000021C0000 Data:  0x00000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000001C0010 Data:  0x00000001  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000001C0000 Data:  0x00000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000041C0010 Data:  0x00000001  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000041C0000 Data:  0x00000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021F050 Data:  0x00000001  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021F040 Data:  0x00000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021F030 Data:  0x00000001  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021F020 Data:  0x00000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021F010 Data:  0x00000001  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021F000 Data:  0x00000000  
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 0
+// CHECK: cdo-driver: (NOP Command): Payload Length: 0
+// CHECK: cdo-driver: (Write64): Address:  0x00000000021C0010 Data:  0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x00000000021C0000 Data:  0x00000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000001C0010 Data:  0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x00000000001C0000 Data:  0x00000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000041C0010 Data:  0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x00000000041C0000 Data:  0x00000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021F050 Data:  0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021F040 Data:  0x00000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021F030 Data:  0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021F020 Data:  0x00000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021F010 Data:  0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021F000 Data:  0x00000000
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x000000000021D000  Size: 6
-// CHECK: cdo-driver:     Address: 0x000000000021D000  Data is: 0x00400080 
-// CHECK: cdo-driver:     Address: 0x000000000021D004  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D008  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D00C  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D010  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D014  Data is: 0x06045FE3 
+// CHECK: cdo-driver:     Address: 0x000000000021D000  Data is: 0x00400080
+// CHECK: cdo-driver:     Address: 0x000000000021D004  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D008  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D00C  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D010  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D014  Data is: 0x06045FE3
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x000000000021D020  Size: 6
-// CHECK: cdo-driver:     Address: 0x000000000021D020  Data is: 0x00600100 
-// CHECK: cdo-driver:     Address: 0x000000000021D024  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D028  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D02C  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D030  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D034  Data is: 0x16001FE5 
+// CHECK: cdo-driver:     Address: 0x000000000021D020  Data is: 0x00600100
+// CHECK: cdo-driver:     Address: 0x000000000021D024  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D028  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D02C  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D030  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D034  Data is: 0x16001FE5
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x000000000021D040  Size: 6
-// CHECK: cdo-driver:     Address: 0x000000000021D040  Data is: 0x00A00100 
-// CHECK: cdo-driver:     Address: 0x000000000021D044  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D048  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D04C  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D050  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D054  Data is: 0x0E048000 
+// CHECK: cdo-driver:     Address: 0x000000000021D040  Data is: 0x00A00100
+// CHECK: cdo-driver:     Address: 0x000000000021D044  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D048  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D04C  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D050  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D054  Data is: 0x0E048000
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x000000000021D060  Size: 6
-// CHECK: cdo-driver:     Address: 0x000000000021D060  Data is: 0x00E00040 
-// CHECK: cdo-driver:     Address: 0x000000000021D064  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D068  Data is: 0x0003E000 
-// CHECK: cdo-driver:     Address: 0x000000000021D06C  Data is: 0x01008003 
-// CHECK: cdo-driver:     Address: 0x000000000021D070  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D074  Data is: 0x26001FE0 
+// CHECK: cdo-driver:     Address: 0x000000000021D060  Data is: 0x00E00040
+// CHECK: cdo-driver:     Address: 0x000000000021D064  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D068  Data is: 0x0003E000
+// CHECK: cdo-driver:     Address: 0x000000000021D06C  Data is: 0x01008003
+// CHECK: cdo-driver:     Address: 0x000000000021D070  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D074  Data is: 0x26001FE0
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x000000000021D080  Size: 6
-// CHECK: cdo-driver:     Address: 0x000000000021D080  Data is: 0x00E20040 
-// CHECK: cdo-driver:     Address: 0x000000000021D084  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D088  Data is: 0x0003E000 
-// CHECK: cdo-driver:     Address: 0x000000000021D08C  Data is: 0x01008003 
-// CHECK: cdo-driver:     Address: 0x000000000021D090  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D094  Data is: 0x2E000000 
+// CHECK: cdo-driver:     Address: 0x000000000021D080  Data is: 0x00E20040
+// CHECK: cdo-driver:     Address: 0x000000000021D084  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D088  Data is: 0x0003E000
+// CHECK: cdo-driver:     Address: 0x000000000021D08C  Data is: 0x01008003
+// CHECK: cdo-driver:     Address: 0x000000000021D090  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D094  Data is: 0x2E000000
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x000000000021D0A0  Size: 6
-// CHECK: cdo-driver:     Address: 0x000000000021D0A0  Data is: 0x00E40040 
-// CHECK: cdo-driver:     Address: 0x000000000021D0A4  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D0A8  Data is: 0x0003E000 
-// CHECK: cdo-driver:     Address: 0x000000000021D0AC  Data is: 0x01008003 
-// CHECK: cdo-driver:     Address: 0x000000000021D0B0  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D0B4  Data is: 0x36000000 
+// CHECK: cdo-driver:     Address: 0x000000000021D0A0  Data is: 0x00E40040
+// CHECK: cdo-driver:     Address: 0x000000000021D0A4  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D0A8  Data is: 0x0003E000
+// CHECK: cdo-driver:     Address: 0x000000000021D0AC  Data is: 0x01008003
+// CHECK: cdo-driver:     Address: 0x000000000021D0B0  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D0B4  Data is: 0x36000000
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x000000000021D0C0  Size: 6
-// CHECK: cdo-driver:     Address: 0x000000000021D0C0  Data is: 0x00E60040 
-// CHECK: cdo-driver:     Address: 0x000000000021D0C4  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D0C8  Data is: 0x0003E000 
-// CHECK: cdo-driver:     Address: 0x000000000021D0CC  Data is: 0x01008003 
-// CHECK: cdo-driver:     Address: 0x000000000021D0D0  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x000000000021D0D4  Data is: 0x1E042000 
+// CHECK: cdo-driver:     Address: 0x000000000021D0C0  Data is: 0x00E60040
+// CHECK: cdo-driver:     Address: 0x000000000021D0C4  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D0C8  Data is: 0x0003E000
+// CHECK: cdo-driver:     Address: 0x000000000021D0CC  Data is: 0x01008003
+// CHECK: cdo-driver:     Address: 0x000000000021D0D0  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x000000000021D0D4  Data is: 0x1E042000
 
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021DE04 Data:  0x00010000  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000021DE00  Mask: 0x00000000  Data: 0x00000001 
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021DE0C Data:  0x00010001  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000021DE08  Mask: 0x00000000  Data: 0x00000001 
-// CHECK: cdo-driver: (Write64): Address:  0x000000000021DE14 Data:  0x00010003  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000021DE10  Mask: 0x00000000  Data: 0x00000001 
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021DE04 Data:  0x00010000
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000021DE00  Mask: 0x00000000  Data: 0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021DE0C Data:  0x00010001
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000021DE08  Mask: 0x00000000  Data: 0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x000000000021DE14 Data:  0x00010003
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000021DE10  Mask: 0x00000000  Data: 0x00000001
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x00000000041A0000  Size: 8
-// CHECK: cdo-driver:     Address: 0x00000000041A0000  Data is: 0x00000100 
-// CHECK: cdo-driver:     Address: 0x00000000041A0004  Data is: 0x000A0000 
-// CHECK: cdo-driver:     Address: 0x00000000041A0008  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A000C  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A0010  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A0014  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A0018  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A001C  Data is: 0x8140FF41 
+// CHECK: cdo-driver:     Address: 0x00000000041A0000  Data is: 0x00000100
+// CHECK: cdo-driver:     Address: 0x00000000041A0004  Data is: 0x000A0000
+// CHECK: cdo-driver:     Address: 0x00000000041A0008  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A000C  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A0010  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A0014  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A0018  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A001C  Data is: 0x8140FF41
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 0 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 0
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x00000000041A0020  Size: 8
-// CHECK: cdo-driver:     Address: 0x00000000041A0020  Data is: 0x00000100 
-// CHECK: cdo-driver:     Address: 0x00000000041A0024  Data is: 0x001A0000 
-// CHECK: cdo-driver:     Address: 0x00000000041A0028  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A002C  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A0030  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A0034  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A0038  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000041A003C  Data is: 0x8141FF40 
+// CHECK: cdo-driver:     Address: 0x00000000041A0020  Data is: 0x00000100
+// CHECK: cdo-driver:     Address: 0x00000000041A0024  Data is: 0x001A0000
+// CHECK: cdo-driver:     Address: 0x00000000041A0028  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A002C  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A0030  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A0034  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A0038  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000041A003C  Data is: 0x8141FF40
 
-// CHECK: cdo-driver: (Write64): Address:  0x00000000041A0604 Data:  0x00010000  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000041A0600  Mask: 0x00000000  Data: 0x00000001 
-// CHECK: cdo-driver: (Write64): Address:  0x00000000041A0634 Data:  0x00010001  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000041A0630  Mask: 0x00000000  Data: 0x00000001 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (Write64): Address:  0x00000000041A0604 Data:  0x00010000
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000041A0600  Mask: 0x00000000  Data: 0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x00000000041A0634 Data:  0x00010001
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000041A0630  Mask: 0x00000000  Data: 0x00000001
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x00000000001A0000  Size: 8
-// CHECK: cdo-driver:     Address: 0x00000000001A0000  Data is: 0x00000080 
-// CHECK: cdo-driver:     Address: 0x00000000001A0004  Data is: 0x000A0000 
-// CHECK: cdo-driver:     Address: 0x00000000001A0008  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000001A000C  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000001A0010  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000001A0014  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000001A0018  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000001A001C  Data is: 0x8140FF41 
+// CHECK: cdo-driver:     Address: 0x00000000001A0000  Data is: 0x00000080
+// CHECK: cdo-driver:     Address: 0x00000000001A0004  Data is: 0x000A0000
+// CHECK: cdo-driver:     Address: 0x00000000001A0008  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000001A000C  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000001A0010  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000001A0014  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000001A0018  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000001A001C  Data is: 0x8140FF41
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 0 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 0
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x00000000001A0020  Size: 8
-// CHECK: cdo-driver:     Address: 0x00000000001A0020  Data is: 0x00000080 
-// CHECK: cdo-driver:     Address: 0x00000000001A0024  Data is: 0x001A0000 
-// CHECK: cdo-driver:     Address: 0x00000000001A0028  Data is: 0x00100000 
-// CHECK: cdo-driver:     Address: 0x00000000001A002C  Data is: 0x0010000F 
-// CHECK: cdo-driver:     Address: 0x00000000001A0030  Data is: 0x00040007 
-// CHECK: cdo-driver:     Address: 0x00000000001A0034  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000001A0038  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000001A003C  Data is: 0x8141FF40 
+// CHECK: cdo-driver:     Address: 0x00000000001A0020  Data is: 0x00000080
+// CHECK: cdo-driver:     Address: 0x00000000001A0024  Data is: 0x001A0000
+// CHECK: cdo-driver:     Address: 0x00000000001A0028  Data is: 0x00100000
+// CHECK: cdo-driver:     Address: 0x00000000001A002C  Data is: 0x0010000F
+// CHECK: cdo-driver:     Address: 0x00000000001A0030  Data is: 0x00040007
+// CHECK: cdo-driver:     Address: 0x00000000001A0034  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000001A0038  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000001A003C  Data is: 0x8141FF40
 
-// CHECK: cdo-driver: (Write64): Address:  0x00000000001A0604 Data:  0x00010000  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000001A0600  Mask: 0x00000000  Data: 0x00000001 
-// CHECK: cdo-driver: (Write64): Address:  0x00000000001A0634 Data:  0x00010001  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000001A0630  Mask: 0x00000000  Data: 0x00000001 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 2 
+// CHECK: cdo-driver: (Write64): Address:  0x00000000001A0604 Data:  0x00010000
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000001A0600  Mask: 0x00000000  Data: 0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x00000000001A0634 Data:  0x00010001
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000001A0630  Mask: 0x00000000  Data: 0x00000001
+// CHECK: cdo-driver: (NOP Command): Payload Length: 2
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x00000000021A0000  Size: 8
-// CHECK: cdo-driver:     Address: 0x00000000021A0000  Data is: 0x00000200 
-// CHECK: cdo-driver:     Address: 0x00000000021A0004  Data is: 0x000A0000 
-// CHECK: cdo-driver:     Address: 0x00000000021A0008  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000021A000C  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000021A0010  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000021A0014  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000021A0018  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000021A001C  Data is: 0x8140FF41 
+// CHECK: cdo-driver:     Address: 0x00000000021A0000  Data is: 0x00000200
+// CHECK: cdo-driver:     Address: 0x00000000021A0004  Data is: 0x000A0000
+// CHECK: cdo-driver:     Address: 0x00000000021A0008  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000021A000C  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000021A0010  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000021A0014  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000021A0018  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000021A001C  Data is: 0x8140FF41
 
-// CHECK: cdo-driver: (NOP Command): Payload Length: 0 
+// CHECK: cdo-driver: (NOP Command): Payload Length: 0
 // CHECK: cdo-driver: (BlockWrite-DMAWriteCmd): Start Address: 0x00000000021A0020  Size: 8
-// CHECK: cdo-driver:     Address: 0x00000000021A0020  Data is: 0x00000200 
-// CHECK: cdo-driver:     Address: 0x00000000021A0024  Data is: 0x001A0000 
-// CHECK: cdo-driver:     Address: 0x00000000021A0028  Data is: 0x00080000 
-// CHECK: cdo-driver:     Address: 0x00000000021A002C  Data is: 0x0020001F 
-// CHECK: cdo-driver:     Address: 0x00000000021A0030  Data is: 0x00100003 
-// CHECK: cdo-driver:     Address: 0x00000000021A0034  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000021A0038  Data is: 0x00000000 
-// CHECK: cdo-driver:     Address: 0x00000000021A003C  Data is: 0x8141FF40 
+// CHECK: cdo-driver:     Address: 0x00000000021A0020  Data is: 0x00000200
+// CHECK: cdo-driver:     Address: 0x00000000021A0024  Data is: 0x001A0000
+// CHECK: cdo-driver:     Address: 0x00000000021A0028  Data is: 0x00080000
+// CHECK: cdo-driver:     Address: 0x00000000021A002C  Data is: 0x0020001F
+// CHECK: cdo-driver:     Address: 0x00000000021A0030  Data is: 0x00100003
+// CHECK: cdo-driver:     Address: 0x00000000021A0034  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000021A0038  Data is: 0x00000000
+// CHECK: cdo-driver:     Address: 0x00000000021A003C  Data is: 0x8141FF40
 
-// CHECK: cdo-driver: (Write64): Address:  0x00000000021A0604 Data:  0x00010000  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000021A0600  Mask: 0x00000000  Data: 0x00000001 
-// CHECK: cdo-driver: (Write64): Address:  0x00000000021A0634 Data:  0x00010001  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000021A0630  Mask: 0x00000000  Data: 0x00000001 
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F008 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F100 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F030 Data:  0x80000005  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F114 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F048 Data:  0x80000009  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F124 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F010 Data:  0x80000012  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F148 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000001B0000 Data:  0x80000007  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000001B011C Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000001B002C Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000001B0100 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000203F008 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000203F100 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000203F030 Data:  0x8000000A  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000203F128 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000203F020 Data:  0x80000012  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000203F148 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000021B0000 Data:  0x80000007  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000021B011C Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000021B002C Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000021B0100 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000403F008 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000403F100 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000403F020 Data:  0x8000000E  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000403F138 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000041B001C Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000041B0100 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000041B0000 Data:  0x8000000D  
-// CHECK: cdo-driver: (Write64): Address:  0x00000000041B0134 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000023F004 Data:  0x80000005  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000023F114 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000023F008 Data:  0x80000013  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000023F14C Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000023F04C Data:  0x80000001  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000023F104 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000223F024 Data:  0x80000005  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000223F114 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000223F04C Data:  0x8000000B  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000223F12C Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000423F014 Data:  0x8000000B  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000423F12C Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F008 Data:  0x80000000  
-// CHECK: cdo-driver: (Write64): Address:  0x000000000003F100 Data:  0x80000000  
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000001F000  Mask: 0x00000C00  Data: 0x00000400 
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000001F000  Mask: 0x0000C000  Data: 0x00004000 
-// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000001F004  Mask: 0x00000030  Data: 0x00000010 
+// CHECK: cdo-driver: (Write64): Address:  0x00000000021A0604 Data:  0x00010000
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000021A0600  Mask: 0x00000000  Data: 0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x00000000021A0634 Data:  0x00010001
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x00000000021A0630  Mask: 0x00000000  Data: 0x00000001
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F008 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F100 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F030 Data:  0x80000005
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F114 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F048 Data:  0x80000009
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F124 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F010 Data:  0x80000012
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F148 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000001B0000 Data:  0x80000007
+// CHECK: cdo-driver: (Write64): Address:  0x00000000001B011C Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000001B002C Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000001B0100 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000203F008 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000203F100 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000203F030 Data:  0x8000000A
+// CHECK: cdo-driver: (Write64): Address:  0x000000000203F128 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000203F020 Data:  0x80000012
+// CHECK: cdo-driver: (Write64): Address:  0x000000000203F148 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000021B0000 Data:  0x80000007
+// CHECK: cdo-driver: (Write64): Address:  0x00000000021B011C Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000021B002C Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000021B0100 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000403F008 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000403F100 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000403F020 Data:  0x8000000E
+// CHECK: cdo-driver: (Write64): Address:  0x000000000403F138 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000041B001C Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000041B0100 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x00000000041B0000 Data:  0x8000000D
+// CHECK: cdo-driver: (Write64): Address:  0x00000000041B0134 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000023F004 Data:  0x80000005
+// CHECK: cdo-driver: (Write64): Address:  0x000000000023F114 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000023F008 Data:  0x80000013
+// CHECK: cdo-driver: (Write64): Address:  0x000000000023F14C Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000023F04C Data:  0x80000001
+// CHECK: cdo-driver: (Write64): Address:  0x000000000023F104 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000223F024 Data:  0x80000005
+// CHECK: cdo-driver: (Write64): Address:  0x000000000223F114 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000223F04C Data:  0x8000000B
+// CHECK: cdo-driver: (Write64): Address:  0x000000000223F12C Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000423F014 Data:  0x8000000B
+// CHECK: cdo-driver: (Write64): Address:  0x000000000423F12C Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F008 Data:  0x80000000
+// CHECK: cdo-driver: (Write64): Address:  0x000000000003F100 Data:  0x80000000
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000001F000  Mask: 0x00000C00  Data: 0x00000400
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000001F000  Mask: 0x0000C000  Data: 0x00004000
+// CHECK: cdo-driver: (MaskWrite64): Address: 0x000000000001F004  Mask: 0x00000030  Data: 0x00000010
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_16x16_8xi32__dispatch_0_matmul_16x1_0.aiecc.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_16x16_8xi32__dispatch_0_matmul_16x1_0.aiecc.mlir
index 86e891fb2..96f64f88a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_16x16_8xi32__dispatch_0_matmul_16x1_0.aiecc.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_16x16_8xi32__dispatch_0_matmul_16x1_0.aiecc.mlir
@@ -19,12 +19,12 @@ aie.device(npu1_4col) {
   %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i8}
   %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i8}
   %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i8}
-  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<16x8xi32> 
-  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<8x16xi32> 
-  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<16x16xi32> 
-  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<1x4x4x8xi32> 
-  %buf1 = aie.buffer(%tile_0_2) {address = 1536 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<4x1x8x4xi32> 
-  %buf0 = aie.buffer(%tile_0_2) {address = 2048 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x4xi32> 
+  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<16x8xi32>
+  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<8x16xi32>
+  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<16x16xi32>
+  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<1x4x4x8xi32>
+  %buf1 = aie.buffer(%tile_0_2) {address = 1536 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<4x1x8x4xi32>
+  %buf0 = aie.buffer(%tile_0_2) {address = 2048 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x4xi32>
   %mem_0_2 = aie.mem(%tile_0_2) {
     %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
   ^bb1:  // 2 preds: ^bb0, ^bb1
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_16x64_32xi8__dispatch_0_matmul_tran_0.aiecc.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_16x64_32xi8__dispatch_0_matmul_tran_0.aiecc.mlir
index 6e7e7f175..0b1b9def4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_16x64_32xi8__dispatch_0_matmul_tran_0.aiecc.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_16x64_32xi8__dispatch_0_matmul_tran_0.aiecc.mlir
@@ -19,12 +19,12 @@ aie.device(npu1_4col) {
   %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i8}
   %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i8}
   %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i8}
-  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<16x64xi8> 
-  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<32x64xi8> 
-  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<16x32xi32> 
-  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<8x4x4x8xi8> 
-  %buf1 = aie.buffer(%tile_0_2) {address = 2048 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<8x4x8x8xi8> 
-  %buf0 = aie.buffer(%tile_0_2) {address = 4096 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x8xi32> 
+  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<16x64xi8>
+  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<32x64xi8>
+  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<16x32xi32>
+  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<8x4x4x8xi8>
+  %buf1 = aie.buffer(%tile_0_2) {address = 2048 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<8x4x8x8xi8>
+  %buf0 = aie.buffer(%tile_0_2) {address = 4096 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x8xi32>
   %mem_0_2 = aie.mem(%tile_0_2) {
     %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
   ^bb1:  // 2 preds: ^bb0, ^bb1
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_64x64_64xbf16__dispatch_0_matmul_64_0.aiecc.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_64x64_64xbf16__dispatch_0_matmul_64_0.aiecc.mlir
index 234671059..7052d005c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_64x64_64xbf16__dispatch_0_matmul_64_0.aiecc.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_64x64_64xbf16__dispatch_0_matmul_64_0.aiecc.mlir
@@ -19,12 +19,12 @@ aie.device(npu1_4col) {
   %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i8}
   %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i8}
   %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i8}
-  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<64x64xbf16> 
-  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<64x64xbf16> 
-  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<64x64xf32> 
-  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<8x16x4x8xbf16> 
-  %buf1 = aie.buffer(%tile_0_2) {address = 9216 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<16x8x8x4xbf16> 
-  %buf0 = aie.buffer(%tile_0_2) {address = 17408 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<16x16x4x4xf32> 
+  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<64x64xbf16>
+  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<64x64xbf16>
+  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<64x64xf32>
+  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<8x16x4x8xbf16>
+  %buf1 = aie.buffer(%tile_0_2) {address = 9216 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<16x8x8x4xbf16>
+  %buf0 = aie.buffer(%tile_0_2) {address = 17408 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<16x16x4x4xf32>
   %mem_0_2 = aie.mem(%tile_0_2) {
     %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
   ^bb1:  // 2 preds: ^bb0, ^bb1
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_64x64_64xi8__dispatch_0_matmul_64x6_0.aiecc.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_64x64_64xi8__dispatch_0_matmul_64x6_0.aiecc.mlir
index 635b03f8e..19ffa0664 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_64x64_64xi8__dispatch_0_matmul_64x6_0.aiecc.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_64x64_64xi8__dispatch_0_matmul_64x6_0.aiecc.mlir
@@ -19,12 +19,12 @@ aie.device(npu1_4col) {
   %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i8}
   %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i8}
   %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i8}
-  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<64x64xi8> 
-  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<64x64xi8> 
-  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<64x64xi32> 
-  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<8x16x4x8xi8> 
-  %buf1 = aie.buffer(%tile_0_2) {address = 5120 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<8x8x8x8xi8> 
-  %buf0 = aie.buffer(%tile_0_2) {address = 9216 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<8x16x4x8xi32> 
+  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<64x64xi8>
+  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<64x64xi8>
+  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<64x64xi32>
+  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<8x16x4x8xi8>
+  %buf1 = aie.buffer(%tile_0_2) {address = 5120 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<8x8x8x8xi8>
+  %buf0 = aie.buffer(%tile_0_2) {address = 9216 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<8x16x4x8xi32>
   %mem_0_2 = aie.mem(%tile_0_2) {
     %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
   ^bb1:  // 2 preds: ^bb0, ^bb1
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_8x32_16xi32__dispatch_0_matmul_8x32_0.aiecc.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_8x32_16xi32__dispatch_0_matmul_8x32_0.aiecc.mlir
index 5c1d56438..aab516db4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_8x32_16xi32__dispatch_0_matmul_8x32_0.aiecc.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/matmul_8x32_16xi32__dispatch_0_matmul_8x32_0.aiecc.mlir
@@ -19,12 +19,12 @@ aie.device(npu1_4col) {
   %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i8}
   %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i8}
   %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i8}
-  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<8x16xi32> 
-  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<16x32xi32> 
-  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<8x32xi32> 
-  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<2x2x4x8xi32> 
-  %buf1 = aie.buffer(%tile_0_2) {address = 1536 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<8x2x8x4xi32> 
-  %buf0 = aie.buffer(%tile_0_2) {address = 3584 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<8x2x4x4xi32> 
+  %buf5 = aie.buffer(%tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<8x16xi32>
+  %buf4 = aie.buffer(%tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf4"} : memref<16x32xi32>
+  %buf3 = aie.buffer(%tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<8x32xi32>
+  %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<2x2x4x8xi32>
+  %buf1 = aie.buffer(%tile_0_2) {address = 1536 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<8x2x8x4xi32>
+  %buf0 = aie.buffer(%tile_0_2) {address = 3584 : i32, mem_bank = 0 : i32, sym_name = "buf0"} : memref<8x2x4x4xi32>
   %mem_0_2 = aie.mem(%tile_0_2) {
     %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
   ^bb1:  // 2 preds: ^bb0, ^bb1
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/regenerate.sh b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/regenerate.sh
index 6d60c0b6f..7b8e269c4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/regenerate.sh
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/regenerate.sh
@@ -2,4 +2,4 @@
 
 for m in *.mlir; do
   aie_cdo_gen_test $m $PWD 2>&1 | sed -e 's/^/\/\/ CHECK: /' >> $m
-done
\ No newline at end of file
+done
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pad_pack.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pad_pack.mlir
index c4b0ed57c..740d116ab 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pad_pack.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_pad_pack.mlir
@@ -47,7 +47,7 @@ module attributes { transform.with_named_sequence } {
 
     // Fuse fill operation into the forall loop.
     %fused_fill, %fused_loop = transform.structured.fuse_into_containing_op %fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    
+
     // Pad operation.
     %padded, %pad, %__ = transform.structured.pad %tiled_matmul {
       padding_values=[0 : i32, 0 : i32, 0 : i32],
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_simple_pack.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_simple_pack.mlir
index 5c6815110..449e1a0b9 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_simple_pack.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/transform_dialect/matmul_fill_spec_simple_pack.mlir
@@ -178,4 +178,3 @@ module attributes { transform.with_named_sequence } {
 //       CHECK:   memref.dealloc %{{.*}} : memref<1x1x64x512xi32, 1>
 //       CHECK:   memref.dealloc %{{.*}} : memref<1x1x64x64xi32, 1>
 //       CHECK: }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp
index 008bf4124..a7086ff4d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp
@@ -32,7 +32,8 @@ LogicalResult forallToFor(RewriterBase &rewriter, Operation *op) {
       return WalkResult::advance();
     }
     if (failed(scf::forallToForLoop(rewriter, forallOp))) {
-      forallOp.emitOpError() << "was not transformed from `scf.forall` to `scf.for`";
+      forallOp.emitOpError()
+          << "was not transformed from `scf.forall` to `scf.for`";
       return WalkResult::interrupt();
     }
     return WalkResult::advance();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp
index 1120b3d1c..d5d845f3f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertCoreForallToFor.cpp
@@ -59,7 +59,7 @@ class AMDAIEConvertCoreForallToForPass
 
   AMDAIEConvertCoreForallToForPass() = default;
   AMDAIEConvertCoreForallToForPass(
-      const AMDAIEConvertCoreForallToForPass &pass) {};
+      const AMDAIEConvertCoreForallToForPass &pass){};
   void runOnOperation() override;
 };
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp
index 186d16542..8b97637cc 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp
@@ -440,7 +440,7 @@ class AMDAIECreateAIEWorkgroupPass
   }
 
   AMDAIECreateAIEWorkgroupPass() = default;
-  AMDAIECreateAIEWorkgroupPass(const AMDAIECreateAIEWorkgroupPass &pass) {};
+  AMDAIECreateAIEWorkgroupPass(const AMDAIECreateAIEWorkgroupPass &pass){};
   void runOnOperation() override;
 };
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp
index 16cc9d0d0..5fdb22002 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp
@@ -478,8 +478,8 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() {
   // possible.
   RewritePatternSet unrollLocalLoopsPatterns(context);
   unrollLocalLoopsPatterns.insert<AMDAIEUnrollLocalLoops>(context);
-  if (failed(applyPatternsGreedily(
-          moduleOp, std::move(unrollLocalLoopsPatterns)))) {
+  if (failed(applyPatternsGreedily(moduleOp,
+                                   std::move(unrollLocalLoopsPatterns)))) {
     moduleOp.emitOpError()
         << "loop unrolling of loops selected for parallel execution failed";
     return signalPassFailure();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
index d03112fca..906502b04 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
@@ -513,7 +513,7 @@ struct SubsumeLoopIntoDMA
           npuCircularDmaOp.getSourceMemorySpaceAsUInt();
       std::optional<uint8_t> targetMemspaceInt =
           npuCircularDmaOp.getTargetMemorySpaceAsUInt();
-        if (!sourceMemspaceInt.has_value() || !targetMemspaceInt.has_value()) {
+      if (!sourceMemspaceInt.has_value() || !targetMemspaceInt.has_value()) {
         return rewriter.notifyMatchFailure(
             op, "Needs a memory space for both source and target");
       }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertLoopsForVectorization.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertLoopsForVectorization.cpp
index 8acc97f14..ace782ea7 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertLoopsForVectorization.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertLoopsForVectorization.cpp
@@ -101,10 +101,10 @@ class AMDAIEInsertLoopsForVectorizationPass
     return tileSizes;
   }
 
-  /// Collapse unit dims of the generic op before tiling for vectorization. Since
-  /// this is optinal we need not return failure if the collapsing cannot take
-  /// place. Eg: For <2x3x4> since there aren't any unit dimensions, it'd return
-  /// failure, hence we can simply return.
+  /// Collapse unit dims of the generic op before tiling for vectorization.
+  /// Since this is optional we need not return failure if the collapsing cannot
+  /// take place. Eg: For <2x3x4> since there aren't any unit dimensions, it'd
+  /// return failure, hence we can simply return.
   void collapseUnitDims(IRRewriter &rewriter, linalg::GenericOp &genericOp) {
     linalg::ControlDropUnitDims options;
     options.rankReductionStrategy =
@@ -142,7 +142,8 @@ class AMDAIEInsertLoopsForVectorizationPass
       std::optional<SmallVector<int64_t>> tileSizes =
           formTileSizesForElementwise(genericOp);
       if (!tileSizes) {
-        return genericOp->emitOpError()<<"unable to form tile sizes for the elementwise op";
+        return genericOp->emitOpError()
+               << "unable to form tile sizes for the elementwise op";
       }
       performTiling(rewriter, genericOp, *tileSizes);
       return success();
@@ -209,7 +210,8 @@ class AMDAIEInsertLoopsForVectorizationPass
     std::optional<SmallVector<int64_t>> tileSizes =
         formTileSizesForMatmul(genericOp);
     if (!tileSizes) {
-      return genericOp->emitOpError()<<"unable to form tile sizes for the matmul op";
+      return genericOp->emitOpError()
+             << "unable to form tile sizes for the matmul op";
     }
     performTiling(rewriter, genericOp, *tileSizes);
     return success();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELinalgFunctionOutlining.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELinalgFunctionOutlining.cpp
index 57f0e1df4..20ba6df47 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELinalgFunctionOutlining.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELinalgFunctionOutlining.cpp
@@ -175,7 +175,6 @@ void AMDAIELinalgFunctionOutliningPass::runOnOperation() {
     if (failed(maybeFunc)) return WalkResult::interrupt();
     func::FuncOp func = maybeFunc.value();
 
-
     rewriter.setInsertionPoint(computeOp);
     rewriter.create<func::CallOp>(computeOp.getLoc(), func,
                                   computeOp->getOperands());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp
index 8f39e0aed..d64107463 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp
@@ -4,10 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "iree-amd-aie/IR/AMDAIEDialect.h"
 #include "iree-amd-aie/Transforms/Passes.h"
-#include "mlir/Pass/Pass.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/Utils/Utils.h"
 #include "mlir/Pass/Pass.h"
@@ -15,35 +14,34 @@
 
 namespace mlir::iree_compiler::AMDAIE {
 
-  using namespace mlir;
+using namespace mlir;
 
 namespace {
 
-// A pass which removes the alignment attribute from llvm load operations, 
+// A pass which removes the alignment attribute from llvm load operations,
 // if the alignment is less than 4 (2 or 1).
 //
 // Example. The pass replaces:
 //
 // ```
-//  %113 = llvm.load %112 {alignment = 2 : i64} 
+//  %113 = llvm.load %112 {alignment = 2 : i64}
 //                   : !llvm.ptr -> vector<32xbf16>
 // ```
 //
 // with
 //
 // ```
-//  %113 = llvm.load %112 
+//  %113 = llvm.load %112
 //                   : !llvm.ptr -> vector<32xbf16>
 // ```
 //
 // If this pass is not included in the matmul pipeline, there is an OOM error
 // later in the compilation. This is a temporary workaround while a better
-// solution is found: propagation of memref.assume_alignment is one option. 
+// solution is found: propagation of memref.assume_alignment is one option.
 // See also https://jira.xilinx.com/projects/AIECC/issues/AIECC-589
 
 class AMDAIELoadAlignmentReset
-    : public impl::AMDAIELoadAlignmentResetBase<
-          AMDAIELoadAlignmentReset> {
+    : public impl::AMDAIELoadAlignmentResetBase<AMDAIELoadAlignmentReset> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<AMDAIEDialect>();
   }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToUKernels.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToUKernels.cpp
index 30b78d107..d38c161e6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToUKernels.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToUKernels.cpp
@@ -276,8 +276,7 @@ void AMDAIELowerToUKernelsPass::runOnOperation() {
                                                            pathToUkernels);
   patterns.insert<LowerToUKernelPattern<linalg::FillOp>>(context, allTargets,
                                                          pathToUkernels);
-  if (failed(
-          applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
     return signalPassFailure();
   }
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp
index fea7cdcd6..d0c8f4023 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESinkIntoCore.cpp
@@ -89,8 +89,7 @@ class AMDAIESinkIntoCorePass
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     patterns.insert<SinkingPattern>(&getContext());
-    if (failed(applyPatternsGreedily(getOperation(),
-                                            std::move(patterns)))) {
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
       signalPassFailure();
     }
   }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/BridgeToAIRPass.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/BridgeToAIRPass.cpp
index 824070eb9..4d4b06d63 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/BridgeToAIRPass.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/BridgeToAIRPass.cpp
@@ -102,8 +102,7 @@ void AMDAIEBridgeToAIRPass::runOnOperation() {
   patterns
       .insert<LinalgCopyToMemRefCopy, SCFForAllToParallelOp, AffineApplyOnSym>(
           context);
-  if (failed(
-          applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
     return signalPassFailure();
   }
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp
index edff2a7ea..4c368a868 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp
@@ -316,8 +316,7 @@ void AMDAIEDecomposeLinalgExtPackUnPackToAIRPass::runOnOperation() {
   // Second-stage lowering of pack and unpack ops.
   RewritePatternSet patterns(ctx);
   patterns.add<LowerPackPattern, LowerUnPackPattern>(ctx);
-  if (failed(
-          applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
     return signalPassFailure();
   }
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h
index 0a9f6ecfa..c8c50643f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h
@@ -14,11 +14,7 @@
 namespace mlir::iree_compiler::AMDAIE {
 
 /// Enum for AIE lowering pipelines to pick.
-enum class LowerToAIEPassPipeline {
-  AIR,
-  ObjectFifo,
-  None
-};
+enum class LowerToAIEPassPipeline { AIR, ObjectFifo, None };
 
 /// Enum for tiling pass pipelines to pick. Because of how the pass-pipeline
 /// enums are implemented using tablegen in IREE, it isnt extensible.
@@ -35,7 +31,12 @@ enum class TilePassPipeline {
 enum class PeelingType { First, Last, FirstLast };
 
 /// Enum for operands to be bufferized to allocation.
-enum class BufferizeOperand { LinalgInputOutput, LinalgInput, LinalgOutput, PackInput };
+enum class BufferizeOperand {
+  LinalgInputOutput,
+  LinalgInput,
+  LinalgOutput,
+  PackInput
+};
 
 /// Enum for hardware mapping attributes.
 enum class HardwareMapping {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEUtilsTest.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEUtilsTest.cpp
index d44fb86ee..218245d9d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEUtilsTest.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEUtilsTest.cpp
@@ -45,7 +45,6 @@ TEST(FindLargestFactorTest, Test0) {
 }  // namespace
 
 int main(int argc, char **argv) {
-    ::testing::InitGoogleTest(&argc, argv);
-    return RUN_ALL_TESTS();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir
index d8d45da0f..ee592cada 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir
@@ -395,4 +395,3 @@ func.func @epilogue_write_with_preceding_none_accesses(
   }
   return
 }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
index 785a20ff7..a76809a60 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
@@ -242,7 +242,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Expect two DMA copy operations at the innermost loop have BD IDs as expressions. #map0: 1~15, #map1: 0~15 
+// Expect two DMA copy operations at the innermost loop have BD IDs as expressions. #map0: 1~15, #map1: 0~15
 
 // CHECK: #map = affine_map<(d0) -> (d0 mod 15 + 1)>
 // CHECK: #map1 = affine_map<(d0) -> (d0 mod 16)>
@@ -389,7 +389,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
           amdaie.npu.dma_wait(%0 : !amdaie.async_source_token)
           scf.for %arg5 = %c0 to %c2 step %c1 {
             %1 = amdaie.npu.dma_cpy_nd async_target %connection_1(%from_memref_2[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-            %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>            
+            %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
             amdaie.npu.dma_wait(%1 : !amdaie.async_target_token)
             amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
           }
@@ -462,10 +462,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
           %0 = amdaie.npu.dma_cpy_nd async_source %connection_0([] [] [], %from_memref_1[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
           scf.for %arg5 = %c0 to %c2 step %c1 {
             %1 = amdaie.npu.dma_cpy_nd async_target %connection_1(%from_memref_2[] [] [], [] [] []) : target_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
-            %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>            
+            %2 = amdaie.npu.dma_cpy_nd async_source %connection_2([] [] [], %from_memref_3[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
             amdaie.npu.dma_wait(%1 : !amdaie.async_target_token)
             amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
-          }          
+          }
           amdaie.npu.dma_wait(%0 : !amdaie.async_source_token)
         }
         amdaie.end
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir
index 0b0950f90..0073f7ecf 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir
@@ -473,7 +473,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // DMA ordering checks
 //===----------------------------------------------------------------------===//
 
-// We combine across wait operations, which should be ok as no other actor should 
+// We combine across wait operations, which should be ok as no other actor should
 // touch the circular DMA in between. Therefore, the wait can be removed.
 // CHECK-LABEL: @wait_after_first
 // CHECK:       %[[CONNECTION:.+]] = amdaie.connection
@@ -568,7 +568,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 //===----------------------------------------------------------------------===//
 // npu.circular_dma_cpy_nd
-// Note: only a few checks as most logic is the same for 
+// Note: only a few checks as most logic is the same for
 // `npu.circular_dma_cpy_nd` and `npu.dma_cpy_nd`.
 //===----------------------------------------------------------------------===//
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
index 74150676a..638b54182 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
@@ -181,7 +181,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Expect four `push_to_queue` operations on the same `row`, `direction`, and `channel` 
+// Expect four `push_to_queue` operations on the same `row`, `direction`, and `channel`
 // but with different `col` values. The order of the `col` values is 0, 3, 2, 1.
 // After sorting the `col` values, the batched `dma_wait` operation will be converted to
 // a single `tct_sync` operation, with the `col` set to 0 and `col_num` set to 4.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma_failures.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma_failures.mlir
index 992ae4190..d541aa8ef 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma_failures.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma_failures.mlir
@@ -30,4 +30,3 @@ func.func @failure_case() {
   }
   return
 }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir
index bdfbd4f26..8f206f912 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir
@@ -355,7 +355,7 @@ func.func @forall_dmas(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>)
 // CHECK-DAG:       %[[C8_1:.+]] = arith.constant 8 : index
 // CHECK-DAG:       %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]]
 // CHECK:           amdaie.npu.circular_dma_cpy_nd %[[CONNECTION]]([] [] [], [] [] [])
-// CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CONNECTION]]([] [] [], %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) 
+// CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CONNECTION]]([] [] [], %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_source_token)
 // CHECK:           scf.for %{{.*}} = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] {
 // CHECK:             amdaie.npu.circular_dma_cpy_nd %[[CONNECTION2]]([] [] [], [] [] [])
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
index d570d2624..bcb517e08 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
@@ -486,7 +486,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Don't subsume if inter size (dim 0 in a four dimensional size array) or intra size 
+// Don't subsume if inter size (dim 0 in a four dimensional size array) or intra size
 // (dim 1, 2, 3 in a four dimensional size array) is too large.
 // CHECK-LABEL: @exceed_max_size_source
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -530,7 +530,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Don't subsume if inter size (dim 0 in a four dimensional size array) or intra size 
+// Don't subsume if inter size (dim 0 in a four dimensional size array) or intra size
 // (dim 1, 2, 3 in a four dimensional size array) is too large.
 // CHECK-LABEL: @exceed_max_size_target
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -573,7 +573,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Don't subsume if inter stride (dim 0 in a four dimensional size array) or intra stride 
+// Don't subsume if inter stride (dim 0 in a four dimensional size array) or intra stride
 // (dim 1, 2, 3 in a four dimensional size array) is too large.
 // CHECK:       #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 1048577)>
 // CHECK-LABEL: @exceed_max_stride_source
@@ -628,7 +628,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Don't subsume if inter stride (dim 0 in a four dimensional size array) or intra stride 
+// Don't subsume if inter stride (dim 0 in a four dimensional size array) or intra stride
 // (dim 1, 2, 3 in a four dimensional size array) is too large.
 // CHECK:       #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 1048577)>
 // CHECK-LABEL: @exceed_max_stride_target
@@ -684,7 +684,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // -----
 
 //===----------------------------------------------------------------------===//
-// Checks for loops with no dependencies, which should be subsumed. 
+// Checks for loops with no dependencies, which should be subsumed.
 //===----------------------------------------------------------------------===//
 
 // Subsume loop iteration into strided op without dependency.
@@ -1161,7 +1161,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 }
 
 //===----------------------------------------------------------------------===//
-// Checks for dependencies via induction variables (no affine.apply) on both 
+// Checks for dependencies via induction variables (no affine.apply) on both
 // source and target sides.
 //===----------------------------------------------------------------------===//
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir
index 2025e0f17..d27ea92bf 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption_circular.mlir
@@ -147,7 +147,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Ensure no subsumption happens in case of other circular connection users in nested blocks. 
+// Ensure no subsumption happens in case of other circular connection users in nested blocks.
 // The innermost block contains two `amdaie.npu.circular_dma_cpy_nd` to avoid them being subsumed as well.
 // CHECK-LABEL: @nested_blockers
 // CHECK:       %[[CONNECTION:.+]] = amdaie.connection
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
index f74b8bad6..6ca71d325 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
@@ -131,7 +131,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // -----
 
 // Same connection, but different BD IDs are used. Expect the DMA waits to be folded.
-// DMA queue has a maximum size of 4. To optimize, starting from the end of the control code, 
+// DMA queue has a maximum size of 4. To optimize, starting from the end of the control code,
 // retain every 4th DMA wait operation, while folding the others and removing their tokens.
 // CHECK-LABEL: @fold_dma_waits_max_queue_size
 // CHECK:       %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers
@@ -442,7 +442,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S)
       %channel_7 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S)
       %channel_8 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM)
-      %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) 
+      %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM)
       %6 = amdaie.flow({%channel} -> {%channel_7}) {is_packet_flow = false}
       %7 = amdaie.flow({%channel_8} -> {%channel_9}) {is_packet_flow = false}
       %8 = amdaie.connection(%0 {%channel_7}, %2 {%channel}, flow = %6) {connection_type = #amdaie<connection_type Packet>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir
index 46ed238d5..33470fac3 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir
@@ -106,7 +106,7 @@ module {
 // CHECK-SAME:              ins(%[[MATMUL]], %[[OPERAND2]] :
 // CHECK-SAME:              outs(%[[OPERAND3]] :
 // CHECK:                 {
-// CHECK:                   arith.addi  
+// CHECK:                   arith.addi
 // CHECK:                 }
 // CHECK-DAG:             %[[YIELD_MATMUL:.*]] = tensor.insert_slice %[[MATMUL]] into %[[ITER_ARG_1]]
 // CHECK-DAG:             %[[YIELD_ELEM:.*]] = tensor.insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]]
@@ -300,13 +300,13 @@ module {
 // CHECK:         %[[SECOND_LOOP:.*]]:3 = scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) shared_outs(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_2:.*]] = %[[ELEM_OUT]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]])
 // CHECK-SAME:    {
 // CHECK:           %[[MATMUL:.*]] = linalg.generic
-// CHECK:           %[[OPERAND2:.*]] = tensor.extract_slice %[[ELEM_OUT]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] 
+// CHECK:           %[[OPERAND2:.*]] = tensor.extract_slice %[[ELEM_OUT]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
 // CHECK:           %[[OPERAND3:.*]] = tensor.extract_slice %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
 // CHECK:           %[[FUSED_CONSUMER:.*]] = linalg.generic
 // CHECK-SAME:        ins(%[[MATMUL]], %[[OPERAND2]] :
 // CHECK-SAME:        outs(%[[OPERAND3]] :
 // CHECK:           {
-// CHECK:             arith.addi  
+// CHECK:             arith.addi
 // CHECK:           }
 // CHECK-DAG:       %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
 // CHECK-DAG:       %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_pack_into_loop.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_pack_into_loop.mlir
index 3872aaf55..dba836d11 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_pack_into_loop.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_pack_into_loop.mlir
@@ -261,6 +261,3 @@ func.func @pack_without_slice(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x3
 // DEPTH-1-DAG:   %[[PACK_2:.*]] = tensor.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
 // DEPTH-1:       linalg.generic
 // DEPTH-1-SAME:  ins(%[[PACK_2]], %[[PACK_1]]
-
-
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir
index daf8ca6db..c5857e7ad 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir
@@ -15,8 +15,8 @@ module {
 
 // -----
 
-// Shim tile (0, 0) has two producer (MM2S) channels, 
-// both of which are already utilized by existing circuit flows. 
+// Shim tile (0, 0) has two producer (MM2S) channels,
+// both of which are already utilized by existing circuit flows.
 // No producer DMA channel is available for route-shim-to-tile-ctrl.
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
@@ -44,7 +44,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// Successfully inserted six packet flows from shim DMA channels to tile CTRL channels, 
+// Successfully inserted six packet flows from shim DMA channels to tile CTRL channels,
 // and one circuit flow from shim CTRL to shim SOUTH 0.
 // CHECK-LABEL: @column_control_overlay
 // CHECK:    %[[C0:.*]] = arith.constant 0 : index
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir
index b3e85ab1f..6b35c341b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_dma_bd_chain.mlir
@@ -223,7 +223,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK:         %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_memref
 // CHECK:         amdaie.npu.half_dma_cpy_nd  %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]] next_bd = %[[BD_ID_2]] start_bd = %[[BD_ID_0]])
 // CHECK:         amdaie.npu.half_dma_cpy_nd  %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]] next_bd = %[[BD_ID_3]] start_bd = %[[BD_ID_1]])
-// CHECK:         %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]] start_bd = %[[BD_ID_0]]) 
+// CHECK:         %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_0]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]] start_bd = %[[BD_ID_0]])
 // CHECK:         %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_1]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]] start_bd = %[[BD_ID_1]])
 // CHECK:         amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
 // CHECK:         amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_infinite_loop_around_core_block.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_infinite_loop_around_core_block.mlir
index 2c7d7e228..641ea7c81 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_infinite_loop_around_core_block.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_infinite_loop_around_core_block.mlir
@@ -4,7 +4,7 @@
 // CHECK:       %[[TILE:.+]] = amdaie.tile
 // CHECK:       amdaie.core(%[[TILE]], in : [], out : []) {
 // CHECK-NOT:     scf.while
-// CHECK-NOT:     scf.for 
+// CHECK-NOT:     scf.for
 // CHECK:         amdaie.end
 // CHECK:       }
 func.func @empty_core() {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_loops_for_vectorization.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_loops_for_vectorization.mlir
index 08bb085f7..d98755ddd 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_loops_for_vectorization.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_loops_for_vectorization.mlir
@@ -392,7 +392,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     // COLLAPSE:            linalg.generic
     // COLLAPSE-SAME:           ins(%[[SLICE_0]], %[[SLICE_1]] :
     // COLLAPSE-SAME:           outs(%[[SLICE_2]] :
-    
+
     // COALESCE:         scf.for
     // COALESCE-NOT:     scf.for
     // COALESCE:           memref.subview %[[ARG0]]
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/linalg_function_outlining.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/linalg_function_outlining.mlir
index fcde7d22f..99cbb2b69 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/linalg_function_outlining.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/linalg_function_outlining.mlir
@@ -211,4 +211,3 @@ func.func @unoutlineable_strided_layout(%A: memref<4x8xbf16, strided<[9,1]>>, %B
   }
   return
 }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/localize_logical_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/localize_logical_objectfifo.mlir
index 9bf60ab9b..54de5922e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/localize_logical_objectfifo.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/localize_logical_objectfifo.mlir
@@ -30,4 +30,3 @@ module {
     return
   }
 }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_func_args.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_func_args.mlir
index c3400708c..fcda1dec8 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_func_args.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_func_args.mlir
@@ -22,4 +22,3 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     return
   }
 }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
index 998af34db..95f023c86 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
@@ -365,7 +365,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %lock_5 = amdaie.lock(%tile_0_1(1), 0)
       %buffer_3 = amdaie.buffer(%tile_0_2) : memref<2048xi32, 2 : i32>
       %lock_6 = amdaie.lock(%tile_0_2(0), 1)
-      %lock_7 = amdaie.lock(%tile_0_2(1), 0) 
+      %lock_7 = amdaie.lock(%tile_0_2(1), 0)
       %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 1 : i32>, 1>
       %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<4096xi32, 2 : i32>, 1>
       %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S)
@@ -625,11 +625,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // -----
 
 // Tests lowering of a circular DMA operation to a DMA chain.
-// Checks that a circular DMA operation with an 'outer' repetition which is not 
-// part of the objectFifo's repetition count (same repetition on each 
+// Checks that a circular DMA operation with an 'outer' repetition which is not
+// part of the objectFifo's repetition count (same repetition on each
 // connection), is lowered to a chain of `dma_bd` operations with a lock
 // acquire at the beginning of the chain and a lock release at the end. Note
-// that this lowering to multiple `dma_bd` operations is needed because 
+// that this lowering to multiple `dma_bd` operations is needed because
 // `stride == 0` is not supported in hardware and/or because there are more
 // dimensions needed than supported in `dma_bd`.
 // CHECK:     aie.device(npu1_4col)
@@ -695,10 +695,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // -----
 
 // Tests lowering of a circular DMA operation to a DMA chain.
-// Checks that a circular DMA operation with an 'inner' repetition (a dimension 
-// with `stride == 0` after a dimension with `stride != 0`), is lowered to a 
+// Checks that a circular DMA operation with an 'inner' repetition (a dimension
+// with `stride == 0` after a dimension with `stride != 0`), is lowered to a
 // chain of `dma_bd` operations with a lock acquire at the beginning of the chain
-// and a lock release at the end. Note that this lowering to multiple `dma_bd` 
+// and a lock release at the end. Note that this lowering to multiple `dma_bd`
 // operations is needed because `stride == 0` is not supported in hardware and/or
 // because there are more dimensions needed than supported in `dma_bd`.
 // CHECK:     aie.device(npu1_4col)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_workgroup_count.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_workgroup_count.mlir
index 97ae7b53f..b5bc789c9 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_workgroup_count.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_workgroup_count.mlir
@@ -3,7 +3,7 @@ hal.executable private @test {
   hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd"}>) {
     hal.executable.export public @test_export ordinal(0) layout(#hal.pipeline.layout<bindings = [<storage_buffer, ReadOnly>]>) {
     ^bb0(%arg0: !hal.device):
-      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
       hal.return %x, %y, %z : index, index, index
     }
     builtin.module {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir
index 93445cd35..6b6718808 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir
@@ -19,7 +19,7 @@ builtin.module {
     %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2048xi64>> -> tensor<2048x2048xi64>
     %5 = tensor.empty() : tensor<2048x2048xi64>
     %6 = linalg.fill ins(%c0_i64 : i64) outs(%5 : tensor<2048x2048xi64>) -> tensor<2048x2048xi64>
-    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig} 
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
     %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xi64>, tensor<2048x2048xi64>) outs(%6 : tensor<2048x2048xi64>) -> tensor<2048x2048xi64>
     flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi64> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi64>>
     return
@@ -46,7 +46,7 @@ builtin.module {
     %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
     %5 = tensor.empty() : tensor<2048x2048xi32>
     %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
-    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig} 
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
     %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xi32>, tensor<2048x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
     flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
     return
@@ -73,7 +73,7 @@ builtin.module {
     %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2048xbf16>> -> tensor<2048x2048xbf16>
     %5 = tensor.empty() : tensor<2048x2048xbf16>
     %6 = linalg.fill ins(%c0_bf16 : bf16) outs(%5 : tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16>
-    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig} 
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
     %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) outs(%6 : tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16>
     flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xbf16> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xbf16>>
     return
@@ -100,7 +100,7 @@ builtin.module {
     %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x32xi64>> -> tensor<16x32xi64>
     %5 = tensor.empty() : tensor<8x32xi64>
     %6 = linalg.fill ins(%c0_i64 : i64) outs(%5 : tensor<8x32xi64>) -> tensor<8x32xi64>
-    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig} 
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
     %7 = linalg.matmul ins(%3, %4 : tensor<8x16xi64>, tensor<16x32xi64>) outs(%6 : tensor<8x32xi64>) -> tensor<8x32xi64>
     flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi64> -> !flow.dispatch.tensor<writeonly:tensor<8x32xi64>>
     return
@@ -127,7 +127,7 @@ builtin.module {
     %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x32xi32>> -> tensor<16x32xi32>
     %5 = tensor.empty() : tensor<8x32xi32>
     %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<8x32xi32>) -> tensor<8x32xi32>
-    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig} 
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
     %7 = linalg.matmul ins(%3, %4 : tensor<8x16xi32>, tensor<16x32xi32>) outs(%6 : tensor<8x32xi32>) -> tensor<8x32xi32>
     flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<8x32xi32>>
     return
@@ -154,7 +154,7 @@ builtin.module {
     %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x32xbf16>> -> tensor<16x32xbf16>
     %5 = tensor.empty() : tensor<8x32xbf16>
     %6 = linalg.fill ins(%c0_bf16 : bf16) outs(%5 : tensor<8x32xbf16>) -> tensor<8x32xbf16>
-    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig} 
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
     %7 = linalg.matmul ins(%3, %4 : tensor<8x16xbf16>, tensor<16x32xbf16>) outs(%6 : tensor<8x32xbf16>) -> tensor<8x32xbf16>
     flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [8, 32], strides = [1, 1] : tensor<8x32xbf16> -> !flow.dispatch.tensor<writeonly:tensor<8x32xbf16>>
     return
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_failures.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_failures.mlir
index f6afb3896..9062eb565 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_failures.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_failures.mlir
@@ -27,5 +27,3 @@ builtin.module {
     return
   }
 }
-
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir
index f266e861d..815127a5f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir
@@ -15,7 +15,7 @@ func.func @func0() {
 // CHECK-LABEL: @func1
 // CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<1x1x8x16xi32, 1>
 // CHECK: %[[ALLOC1:.*]] = memref.alloc() : memref<1x1x2x2x4x8xi32, 2>
-// CHECK: %[[EXPANDSHAPE0:.*]] = memref.expand_shape %[[ALLOC0]] 
+// CHECK: %[[EXPANDSHAPE0:.*]] = memref.expand_shape %[[ALLOC0]]
 // CHECK-SAME: output_shape [1, 1, 2, 4, 2, 8] : memref<1x1x8x16xi32, 1> into memref<1x1x2x4x2x8xi32, 1>
 // CHECK: %[[TRANSPOSE0:.*]] = memref.transpose %[[EXPANDSHAPE0]] (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x2x4x2x8xi32, 1> to memref<1x1x2x2x4x8xi32, strided<[128, 128, 8, 64, 16, 1]>, 1>
 // CHECK: air.dma_memcpy_nd (%[[ALLOC1]][] [] [], %[[TRANSPOSE0]][] [] []) : (memref<1x1x2x2x4x8xi32, 2>, memref<1x1x2x2x4x8xi32, strided<[128, 128, 8, 64, 16, 1]>, 1>)
@@ -28,7 +28,7 @@ func.func @func1() {
 
 // CHECK-LABEL: @func2
 // CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<32x8x8xf32>
-// CHECK: scf.parallel (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) = 
+// CHECK: scf.parallel (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) =
 // CHECK: %[[SUBVIEW0:.*]] = memref.subview %[[ALLOC0]][%[[ARG0]], %[[ARG1]], 0] [1, 8, 8] [1, 1, 1] : memref<32x8x8xf32> to memref<1x8x8xf32, strided<[64, 8, 1], offset: ?>>
 // CHECK: %[[ALLOC1:.*]] = memref.alloc() : memref<1x1x1x8x8xf32, 1>
 // CHECK: air.dma_memcpy_nd (%[[ALLOC1]][] [] [], %[[SUBVIEW0]][] [] []) : (memref<1x1x1x8x8xf32, 1>, memref<1x8x8xf32, strided<[64, 8, 1], offset: ?>>)
@@ -75,7 +75,7 @@ func.func @func4() {
 
 // CHECK-LABEL: @func5
 // CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<32x8x64xf32>
-// CHECK: scf.parallel (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) = 
+// CHECK: scf.parallel (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) =
 // CHECK: %[[SUBVIEW0:.*]] = memref.subview %[[ALLOC0]][%[[ARG0]], %[[ARG1]], %[[ARG2]]] [1, 8, 64] [1, 1, 1] : memref<32x8x64xf32> to memref<1x8x64xf32, strided<[512, 64, 1], offset: ?>>
 // CHECK: %[[ALLOC1:.*]] = memref.alloc() : memref<1x1x1x8x64xf32, 1>
 // CHECK: %[[SUBVIEW1:.*]] = memref.subview %[[ALLOC1]][0, 0, 0, 0, 0] [1, 1, 1, 8, 64] [1, 1, 1, 1, 1] : memref<1x1x1x8x64xf32, 1> to memref<1x8x64xf32, strided<[512, 64, 1]>, 1>
@@ -135,13 +135,13 @@ func.func @func6() {
       // CHECK: %[[SUBVIEW5:.*]] = memref.subview %{{.*}}[%{{.*}}, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<1x1x8x16xi32, strided<[128, 128, 16, 1], offset: ?>, 1>
       // CHECK: %[[SUBVIEW6:.*]] = memref.subview %{{.*}}[0, %{{.*}}, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<1x1x16x16xi32, 1> to memref<1x1x16x16xi32, strided<[256, 256, 16, 1], offset: ?>, 1>
       // CHECK: %[[SUBVIEW7:.*]] = memref.subview %{{.*}}[%{{.*}}, %{{.*}}, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<1x1x8x16xi32, strided<[128, 128, 16, 1], offset: ?>, 1>
-      // CHECK: %[[EXPANDSHAPE0:.*]] = memref.expand_shape %[[SUBVIEW5]] 
+      // CHECK: %[[EXPANDSHAPE0:.*]] = memref.expand_shape %[[SUBVIEW5]]
       // CHECK-SAME: output_shape [1, 1, 2, 4, 2, 8] : memref<1x1x8x16xi32, strided<[128, 128, 16, 1], offset: ?>, 1> into memref<1x1x2x4x2x8xi32, strided<[128, 128, 64, 16, 8, 1], offset: ?>, 1>
       // CHECK: %[[TRANSPOSE2:.*]] = memref.transpose %[[EXPANDSHAPE0]] (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x2x4x2x8xi32, strided<[128, 128, 64, 16, 8, 1], offset: ?>, 1> to memref<1x1x2x2x4x8xi32, strided<[128, 128, 8, 64, 16, 1], offset: ?>, 1>
       // CHECK: air.dma_memcpy_nd (%{{.*}}[] [] [], %[[TRANSPOSE2]][] [] []) : (memref<1x1x2x2x4x8xi32, 2>, memref<1x1x2x2x4x8xi32, strided<[128, 128, 8, 64, 16, 1], offset: ?>, 1>)
       iree_linalg_ext.pack %subview_4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x8x16xi32, strided<[128, 128, 16, 1], offset: ?>, 1> memref<1x1x2x2x4x8xi32, 2>)
       %alloc_8 = memref.alloc() : memref<1x1x2x2x8x8xi32, 2>
-      // CHECK: %[[EXPANDSHAPE1:.*]] = memref.expand_shape %[[SUBVIEW6]] 
+      // CHECK: %[[EXPANDSHAPE1:.*]] = memref.expand_shape %[[SUBVIEW6]]
       // CHECK-SAME: output_shape [1, 1, 2, 8, 2, 8] : memref<1x1x16x16xi32, strided<[256, 256, 16, 1], offset: ?>, 1> into memref<1x1x2x8x2x8xi32, strided<[256, 256, 128, 16, 8, 1], offset: ?>, 1>
       // CHECK: %[[TRANSPOSE3:.*]] = memref.transpose %[[EXPANDSHAPE1]] (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x2x8x2x8xi32, strided<[256, 256, 128, 16, 8, 1], offset: ?>, 1> to memref<1x1x2x2x8x8xi32, strided<[256, 256, 8, 128, 16, 1], offset: ?>, 1>
       // CHECK: air.dma_memcpy_nd (%{{.*}}[] [] [], %[[TRANSPOSE3]][] [] []) : (memref<1x1x2x2x8x8xi32, 2>, memref<1x1x2x2x8x8xi32, strided<[256, 256, 8, 128, 16, 1], offset: ?>, 1>)
@@ -160,7 +160,7 @@ func.func @func6() {
       memref.dealloc %alloc_7 : memref<1x1x2x2x4x8xi32, 2>
       memref.dealloc %alloc_8 : memref<1x1x2x2x8x8xi32, 2>
       memref.dealloc %alloc_9 : memref<1x1x2x2x4x8xi32, 2>
-      scf.reduce 
+      scf.reduce
     }
     // CHECK: %[[SUBVIEW8:.*]] = memref.subview %{{.*}}[0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1>
     // CHECK: air.dma_memcpy_nd (%[[SUBVIEW2]][] [] [], %[[SUBVIEW8]][] [] []) : (memref<8x16xi32, strided<[32, 1], offset: ?>>, memref<8x16xi32, strided<[16, 1]>, 1>)
@@ -168,7 +168,7 @@ func.func @func6() {
     memref.dealloc %alloc_2 : memref<1x1x16x16xi32, 1>
     memref.dealloc %alloc : memref<1x1x8x16xi32, 1>
     memref.dealloc %alloc_3 : memref<1x1x8x16xi32, 1>
-    scf.reduce 
+    scf.reduce
   }
   return
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir
index f10ff8444..2f8da107d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/sink_into_core.mlir
@@ -105,4 +105,3 @@ module {
     return
   }
 }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index 40f38621e..ceee494eb 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -297,15 +297,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // Tests splitting a producer DMA with the consumer DMAs' offsets depending on a loop induction variable through an affine expression with a scale/stride.
 // This results in a splitting factor that is different from the size of the dimension being split and more complex splitting along the stride.
 // For example, if the data in a 4x4 objectFifo at some point is:
-// 
+//
 // [0, 0, 0, 0]
 // [1, 1, 1, 1]
 // [2, 2, 2, 2]
 // [3, 3, 3, 3]
-// 
+//
 // and for an `index` from 0 -> 2, two consumer DMAs access the following rows:
-// 
-// consumer 1: 2 * `index`  (thus rows 0 and 2) 
+//
+// consumer 1: 2 * `index`  (thus rows 0 and 2)
 // consumer 2: 2 * `index` + 1  (thus rows 1 and 3)
 //
 // Therefore, the objectFifo is split into two objectFifos in the following way:
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir
index 41211aa6a..8cb2c0d81 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos_for_connection_reuse.mlir
@@ -86,7 +86,7 @@ module {
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
@@ -242,7 +242,7 @@ module {
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
@@ -254,7 +254,7 @@ module {
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
 //       CHECK:         linalg.generic
@@ -262,7 +262,7 @@ module {
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
 //       CHECK:         linalg.generic
@@ -270,7 +270,7 @@ module {
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out :
 //       CHECK:         linalg.generic
@@ -474,7 +474,7 @@ module {
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
@@ -486,7 +486,7 @@ module {
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
 //       CHECK:         linalg.generic
@@ -494,7 +494,7 @@ module {
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
 //       CHECK:         linalg.generic
@@ -502,7 +502,7 @@ module {
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                          %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out :
 //       CHECK:         linalg.generic
@@ -682,7 +682,7 @@ module {
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
@@ -817,7 +817,7 @@ module {
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
@@ -951,7 +951,7 @@ module {
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
-//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
+//  CHECK-SAME:                                         %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
 //  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_convolution_using_scf_forall.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_convolution_using_scf_forall.mlir
index f769ee92d..e904cf8f1 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_convolution_using_scf_forall.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_convolution_using_scf_forall.mlir
@@ -125,5 +125,3 @@ module {
 
 
 // -----
-
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_copy_using_scf_for.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_copy_using_scf_for.mlir
index fecca8519..3434520ab 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_copy_using_scf_for.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_copy_using_scf_for.mlir
@@ -28,4 +28,3 @@ func.func @matmul_example(%arg0 : tensor<64x2048xi32>, %arg1 : tensor<2048x64xi3
 // CHECK:       linalg.matmul
 // CHECK:       linalg.copy
 // CHECK:   }
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/unsupported_pipelines.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/unsupported_pipelines.mlir
index 2c45140b9..aee1d294b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/unsupported_pipelines.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/unsupported_pipelines.mlir
@@ -24,5 +24,3 @@ builtin.module {
     return
   }
 }
-
-
diff --git a/experimental/delegate/README.md b/experimental/delegate/README.md
index 5536e49b9..ec8931854 100644
--- a/experimental/delegate/README.md
+++ b/experimental/delegate/README.md
@@ -138,7 +138,7 @@ Recompile IREE if you have made any code changes.
 
 ```
 iree-compile --iree-preprocessing-pdl-spec-filename=opt.pdl.mlir matmul.mlir -o matmul.vmfb
- 
+
 iree-run-module --device=local-sync --executable_plugin=$PATH_TO_DELEGATE --module=matmul.vmfb --function=mlp_invocation --input="1x8x768xbf16=2" --input="1x768x768xbf16=3"
 ```
 ### Compililng and running demo 3 (OPT)
@@ -165,7 +165,7 @@ Recompile IREE if you have made any code changes.
 
 ```
 iree-compile large-matmul.mlir -o large-matmul.vmfb --iree-preprocessing-pdl-spec-filename=large-matmul.pdl.mlir
- 
+
 iree-run-module --device=local-sync --executable_plugin=$PATH_TO_DELEGATE --module=large-matmul.vmfb --function=mlp_invocation --input="8192x2432xbf16=2" --input="2432x9728xbf16=3"
 ```
 
@@ -182,7 +182,7 @@ Recompile IREE if you have made any code changes.
 
 ```
 iree-compile large-matmul-f32.mlir -o large-matmul-f32.vmfb --iree-preprocessing-pdl-spec-filename=large-matmul-f32.pdl.mlir
- 
+
 iree-run-module --device=local-sync --executable_plugin=$PATH_TO_DELEGATE --module=large-matmul-f32.vmfb --function=mlp_invocation --input="8192x2432xf32=2" --input="2432x9728xf32=3"
 ```
 
diff --git a/experimental/delegate/large-matmul-f32.mlir b/experimental/delegate/large-matmul-f32.mlir
index a2839c91a..3a9ebd2c3 100644
--- a/experimental/delegate/large-matmul-f32.mlir
+++ b/experimental/delegate/large-matmul-f32.mlir
@@ -62,4 +62,3 @@ module @example attributes {hal.device.targets = [#cpu_target]} {
 // Check that the matmul has been replaced with a call to the external function with the right types
 // TRANSFORM: util.func public @mlp_invocation({{.+}}) -> {{.+}} {
 // TRANSFORM: flow.dispatch @executable::@mlp({{.+}}) : (tensor<8192x2432xf32>, tensor<2432x9728xf32>, i32, i32, i32) -> tensor<8192x9728xf32>
-
diff --git a/experimental/delegate/large-matmul-f32.pdl.mlir b/experimental/delegate/large-matmul-f32.pdl.mlir
index 82277773f..ca1ee3c65 100644
--- a/experimental/delegate/large-matmul-f32.pdl.mlir
+++ b/experimental/delegate/large-matmul-f32.pdl.mlir
@@ -52,7 +52,7 @@ pdl.pattern @mlp : benefit(1) {
   %lhs = pdl.operand
   %rhs = pdl.operand
   %empty = pdl.operand
-  
+
   %lhs_type = pdl.type : tensor<8192x2432xf32>
   %rhs_type = pdl.type : tensor<2432x9728xf32>
   %matmul_type = pdl.type : tensor<8192x9728xf32>
@@ -69,11 +69,11 @@ pdl.pattern @mlp : benefit(1) {
   %zero_fill_val_op = pdl.operation "arith.constant" {"value" = %zero_val_attr} -> (%zero_fill_type : !pdl.type)
   %zero_fill_val = pdl.result 0 of %zero_fill_val_op
 
-  
+
   %fill_op = pdl.operation "linalg.fill" (%zero_fill_val, %empty : !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
   %fill = pdl.result 0 of %fill_op
   %matmul = pdl.operation "linalg.matmul" (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
-  
+
   pdl.rewrite %matmul {
     %i32_type = pdl.type : i32
     %m_op = pdl.operation "arith.constant" {"value" = %fixed_M} -> (%i32_type : !pdl.type)
diff --git a/experimental/delegate/large-matmul.pdl.mlir b/experimental/delegate/large-matmul.pdl.mlir
index 54e6e09a5..6f05c77fd 100644
--- a/experimental/delegate/large-matmul.pdl.mlir
+++ b/experimental/delegate/large-matmul.pdl.mlir
@@ -52,7 +52,7 @@ pdl.pattern @mlp : benefit(1) {
   %lhs = pdl.operand
   %rhs = pdl.operand
   %empty = pdl.operand
-  
+
   %lhs_type = pdl.type : tensor<8192x2432xbf16>
   %rhs_type = pdl.type : tensor<2432x9728xbf16>
   %matmul_type = pdl.type : tensor<8192x9728xf32>
@@ -69,11 +69,11 @@ pdl.pattern @mlp : benefit(1) {
   %zero_fill_val_op = pdl.operation "arith.constant" {"value" = %zero_val_attr} -> (%zero_fill_type : !pdl.type)
   %zero_fill_val = pdl.result 0 of %zero_fill_val_op
 
-  
+
   %fill_op = pdl.operation "linalg.fill" (%zero_fill_val, %empty : !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
   %fill = pdl.result 0 of %fill_op
   %matmul = pdl.operation "linalg.matmul" (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
-  
+
   pdl.rewrite %matmul {
     %i32_type = pdl.type : i32
     %m_op = pdl.operation "arith.constant" {"value" = %fixed_M} -> (%i32_type : !pdl.type)
diff --git a/experimental/delegate/linalg.pdl.mlir b/experimental/delegate/linalg.pdl.mlir
index 57c5e3810..a5eb2f751 100644
--- a/experimental/delegate/linalg.pdl.mlir
+++ b/experimental/delegate/linalg.pdl.mlir
@@ -59,11 +59,11 @@ pdl.pattern @mlp : benefit(1) {
   %zero_f32_op = pdl.operation "arith.constant" {"value" = %zero_val_f32} -> (%f32_type : !pdl.type)
   %zero_f32 = pdl.result 0 of %zero_f32_op
 
-  
+
   %fill_op = pdl.operation "linalg.fill" (%zero_f32, %empty : !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
   %fill = pdl.result 0 of %fill_op
   %matmul = pdl.operation "linalg.matmul" (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
-  
+
   pdl.rewrite %matmul {
     // The pattern above matched `%result`, `%lhs`, `%rhs` needed for the
     // external function call. The values of `%M`, `%N` and `%K` need to
@@ -78,7 +78,7 @@ pdl.pattern @mlp : benefit(1) {
     %m_op = pdl.operation "tensor.dim"(%lhs, %zero : !pdl.value, !pdl.value) -> (%index_type : !pdl.type)
     %m = pdl.result 0 of %m_op
     %n_op = pdl.operation "tensor.dim"(%rhs, %one : !pdl.value, !pdl.value) -> (%index_type : !pdl.type)
-    %n = pdl.result 0 of %n_op 
+    %n = pdl.result 0 of %n_op
     %k_op = pdl.operation "tensor.dim"(%lhs, %one : !pdl.value, !pdl.value)
     %k = pdl.result 0 of %k_op
     %m_i32_op = pdl.operation "arith.index_cast"(%m : !pdl.value) -> (%i32_type : !pdl.type)
diff --git a/experimental/delegate/matmul-16k.pdl.mlir b/experimental/delegate/matmul-16k.pdl.mlir
index 3b4f13ffb..def5954e6 100644
--- a/experimental/delegate/matmul-16k.pdl.mlir
+++ b/experimental/delegate/matmul-16k.pdl.mlir
@@ -49,21 +49,21 @@ pdl.pattern @mlp : benefit(1) {
   //   tensor<1x512x16384xbf16>) outs(%64 : tensor<1x16384x16384xbf16>) ->
   //   tensor<1x16384x16384xbf16>
   // ```
-  
+
   %lhs_type = pdl.type : tensor<1x16384x512xbf16>
   %rhs_type = pdl.type : tensor<1x512x16384xbf16>
   %matmul_type = pdl.type : tensor<1x16384x16384xf32>
   %fixed_M = pdl.attribute = 16384 : i32
   %fixed_N = pdl.attribute = 16384 : i32
   %fixed_K = pdl.attribute = 512 : i32
-  
+
   // %index_type = pdl.type : index
 
   %zero_attr = pdl.attribute = 0.0 : f32
   %zero_type = pdl.type : f32
   %zero_op = pdl.operation "arith.constant" {"value" = %zero_attr} -> (%zero_type : !pdl.type)
   %zero = pdl.result 0 of %zero_op
-  
+
   %empty = pdl.operand
   %fill_op = pdl.operation "linalg.fill" (%zero, %empty : !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
   %fill = pdl.result 0 of %fill_op
@@ -71,7 +71,7 @@ pdl.pattern @mlp : benefit(1) {
   %lhs = pdl.operand : %lhs_type
   %rhs = pdl.operand : %rhs_type
   %matmul = pdl.operation "linalg.batch_matmul" (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
-  
+
   pdl.rewrite %matmul {
     %i32_type = pdl.type : i32
     %m_op = pdl.operation "arith.constant" {"value" = %fixed_M} -> (%i32_type : !pdl.type)
diff --git a/experimental/delegate/mlp_aie_bf16_plugin.cpp b/experimental/delegate/mlp_aie_bf16_plugin.cpp
index e3745cd0b..05f9b33c5 100644
--- a/experimental/delegate/mlp_aie_bf16_plugin.cpp
+++ b/experimental/delegate/mlp_aie_bf16_plugin.cpp
@@ -41,9 +41,9 @@
 #define LARGE_MATMUL_DELEGATE_KERNEL 3
 #define MATMUL_16K_DELEGATE_KERNEL 4
 
-//#############################################################################
+// #############################################################################
 //
-// Macros for configuring AIE delegate behavior
+//  Macros for configuring AIE delegate behavior
 //
 
 // Uncomment the kernel to use
@@ -78,13 +78,13 @@
 // being done
 // #define ENABLE_PERFORMANCE_WARNING 1
 
-//#############################################################################
+// #############################################################################
 
 #if DEBUG_VALUE_CONVERSIONS
-  static bool DebugValueConversions = false;
-  #define CONVERSION_DEBUG(turnOn_) DebugValueConversions = (turnOn_);
+static bool DebugValueConversions = false;
+#define CONVERSION_DEBUG(turnOn_) DebugValueConversions = (turnOn_);
 #else
-  #define CONVERSION_DEBUG(turnOn_) ;
+#define CONVERSION_DEBUG(turnOn_) ;
 #endif
 
 #ifdef ENABLE_TRACE_DELEGATE
@@ -98,13 +98,13 @@
 #define TRACE_DELEGATE1(str_, arg1_)
 #endif
 
-  // Fake bfloat16 type (assuming no C++ 23)
-  using bfloat16_t = std::uint16_t;
+// Fake bfloat16 type (assuming no C++ 23)
+using bfloat16_t = std::uint16_t;
 
-  //#############################################################################
-  //
-  // Configuration of the kernel that the AIE delegate uses
-  //
+// #############################################################################
+//
+//  Configuration of the kernel that the AIE delegate uses
+//
 
 #if DELEGATE_KERNEL_TO_USE == MATMUL_16K_DELEGATE_KERNEL
 // Kernel file names (without extension) relative to installation root
@@ -138,11 +138,12 @@ using ModelReturnDType = float;
 
 #elif DELEGATE_KERNEL_TO_USE == LARGE_MATMUL_DELEGATE_KERNEL
 // Kernel file names (without extension) relative to installation root
-const std::string kernelFileName = 
+const std::string kernelFileName =
     "matmul/matmul-bf16-f32-8192x9728x2432-v1";  // From AIE codegen
 
 // Kernel name inside the xclbin file
-const std::string KernelName = "matmul_8192x9728_2432xbf16__dispatch_0_matmul_81";
+const std::string KernelName =
+    "matmul_8192x9728_2432xbf16__dispatch_0_matmul_81";
 
 // Fixed shape of the matmul kernel
 #define MLP_M 8192
@@ -152,27 +153,27 @@ const std::string KernelName = "matmul_8192x9728_2432xbf16__dispatch_0_matmul_81
 // Types of the matmul LHS, RHS, and result, as defined by the kernel
 using A_DATATYPE = bfloat16_t;
 using B_DATATYPE = bfloat16_t;
-using C_DATATYPE = float; // bfloat16_t;
+using C_DATATYPE = float;  // bfloat16_t;
 
 // Types of the matmul LHS, RHS, and result, as seen by the model
-using ModelLhsDType = float; // bfloat16_t;
-using ModelRhsDType = float; // bfloat16_t;
+using ModelLhsDType = float;  // bfloat16_t;
+using ModelRhsDType = float;  // bfloat16_t;
 using ModelReturnDType = float;
 
 // Set to 1 if the kernel requires a pre-initialized buffer to be loaded
 // into the kernel before the kernel runs
 #define KERNEL_REQUIRES_RESULT_PRELOAD 0
 
-
 //-----------------------------------------------------------------------------
 
 #elif DELEGATE_KERNEL_TO_USE == OPT_DELEGATE_KERNEL
 // Kernel file names (without extension) relative to installation root
-const std::string kernelFileName = 
+const std::string kernelFileName =
     "matmul/matmul-bf16-f32-8x768x768-v1";  // Erwei's 4x4 vector matmul
 
 // Kernel name inside the xclbin file
-const std::string KernelName = "matmul_8x768_768xbf16__dispatch_0_matmul_8x768x7";
+const std::string KernelName =
+    "matmul_8x768_768xbf16__dispatch_0_matmul_8x768x7";
 
 // Fixed shape of the matmul kernel
 #define MLP_M 8
@@ -182,7 +183,7 @@ const std::string KernelName = "matmul_8x768_768xbf16__dispatch_0_matmul_8x768x7
 // Types of the matmul LHS, RHS, and result, as defined by the kernel
 using A_DATATYPE = bfloat16_t;
 using B_DATATYPE = bfloat16_t;
-using C_DATATYPE = float; // bfloat16_t;
+using C_DATATYPE = float;  // bfloat16_t;
 
 // Types of the matmul LHS, RHS, and result, as seen by the model
 using ModelLhsDType = bfloat16_t;
@@ -222,70 +223,70 @@ using ModelReturnDType = float;
 #define KERNEL_REQUIRES_RESULT_PRELOAD 0
 
 #else
-#error "[AIE Delegate]: Unknown kernel.  \
+#error \
+    "[AIE Delegate]: Unknown kernel.  \
 Set DELEGATE_KERNEL_TO_USE to a supported kernel."
 #endif
 
-//#############################################################################
+// #############################################################################
 //
-// AIE delegate implementation
+//  AIE delegate implementation
 //
 
 // Run-time exception class
 class DelegateException : public std::runtime_error {
-public:
+ public:
   DelegateException(const std::string &what) : std::runtime_error(what) {
     TRACE_DELEGATE1("DelegateException: ", what);
   }
 };
 
-
 // Get the path of this plugin's .so
 
 #if defined(_WIN32)
 
 #include <windows.h>
+
 #include <filesystem>
 
 std::string getLibraryPath() {
-    char path[MAX_PATH];
-    HMODULE hm = NULL;
-
-    // Get the currently executing DLL (the delegate DLL)
-    if (GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
-        GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
-        (LPCSTR)&getLibraryPath, &hm) == 0)
-    {
-        int ret = GetLastError();
-        std::ostringstream oss;
-        oss << "[AIE Delegate] FATAL ERROR: Can't open delegate DLL.  Error code: "
-            << ret << std::endl;
-        throw DelegateException(oss.str());
-    }
+  char path[MAX_PATH];
+  HMODULE hm = NULL;
+
+  // Get the currently executing DLL (the delegate DLL)
+  if (GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                            GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                        (LPCSTR)&getLibraryPath, &hm) == 0) {
+    int ret = GetLastError();
+    std::ostringstream oss;
+    oss << "[AIE Delegate] FATAL ERROR: Can't open delegate DLL.  Error code: "
+        << ret << std::endl;
+    throw DelegateException(oss.str());
+  }
 
-    // Get the file path for the DLL
-    if (GetModuleFileName(hm, path, sizeof(path)) == 0)
-    {
-        int ret = GetLastError();
-        std::ostringstream oss;
-        oss << "[AIE Delegate] FATAL ERROR: Can't read delegate DLL file name."
-            "  Error code: " << ret << std::endl;
-        throw DelegateException(oss.str());
-    }
+  // Get the file path for the DLL
+  if (GetModuleFileName(hm, path, sizeof(path)) == 0) {
+    int ret = GetLastError();
+    std::ostringstream oss;
+    oss << "[AIE Delegate] FATAL ERROR: Can't read delegate DLL file name."
+           "  Error code: "
+        << ret << std::endl;
+    throw DelegateException(oss.str());
+  }
 
-    // Strip off the file name, leaving the DLL's directory
-    std::filesystem::path pathObj(path);
-    std::string dllDir = pathObj.parent_path().string();
-    return std::string(dllDir);
+  // Strip off the file name, leaving the DLL's directory
+  std::filesystem::path pathObj(path);
+  std::string dllDir = pathObj.parent_path().string();
+  return std::string(dllDir);
 }
 #elif defined(__linux__)
 #include <dlfcn.h>
 #include <libgen.h>
 
 std::string getLibraryPath() {
-    Dl_info dl_info;
-    dladdr((void*)getLibraryPath, &dl_info);
-    return dirname(const_cast<char *>(dl_info.dli_fname));
+  Dl_info dl_info;
+  dladdr((void *)getLibraryPath, &dl_info);
+  return dirname(const_cast<char *>(dl_info.dli_fname));
 }
 #else
 std::string getLibraryPath() { return std::string(); }
@@ -302,8 +303,7 @@ std::string getLibraryPath() { return std::string(); }
 
 // Default case: catch unsupported conversions at compile time
 template <typename FROM, typename TO>
-struct Converter {
-};
+struct Converter {};
 
 // If the FROM and TO types are the same, no conversion needed.
 template <typename T>
@@ -325,7 +325,8 @@ struct Converter<float, bfloat16_t> {
     if (DebugValueConversions)
       std::cout << "float to bf16 value conversion" << std::endl;
 #endif
-    bfloat16_t bf = (bfloat16_t) (((*reinterpret_cast<uint32_t*>(&value))) >> 16);
+    bfloat16_t bf =
+        (bfloat16_t)(((*reinterpret_cast<uint32_t *>(&value))) >> 16);
     return bf;
   }
 };
@@ -339,7 +340,7 @@ struct Converter<bfloat16_t, float> {
       std::cout << "bf16 to float value conversion" << std::endl;
 #endif
     uint32_t tmp = uint32_t(value) << 16;
-    float f = *reinterpret_cast<float*>(&tmp);
+    float f = *reinterpret_cast<float *>(&tmp);
     return f;
   }
 };
@@ -350,29 +351,32 @@ struct Converter<bfloat16_t, float> {
 
 // General case: copy can be performed if there is a dtype converter between
 // `SrcType` and `DestType`.
-template<typename SrcType, typename DestType>
+template <typename SrcType, typename DestType>
 struct TensorCopier {
-  static void copy(DestType *destBuf, const SrcType *srcBuf, std::size_t numElements) {
+  static void copy(DestType *destBuf, const SrcType *srcBuf,
+                   std::size_t numElements) {
 #ifdef DEBUG_VALUE_CONVERSIONS
-    std::cout << "TensorCopier: Using general (type converting) copy" << std::endl;
+    std::cout << "TensorCopier: Using general (type converting) copy"
+              << std::endl;
 #endif
     DestType *pDest = destBuf;
-    for (const SrcType *pSrc = srcBuf, *pEnd = srcBuf + numElements; pSrc != pEnd; ++pSrc)
+    for (const SrcType *pSrc = srcBuf, *pEnd = srcBuf + numElements;
+         pSrc != pEnd; ++pSrc)
       *pDest++ = Converter<SrcType, DestType>::convert(*pSrc);
   }
 };
 
 // If source and destination types are the same, no dtype conversion is needed,
 // and a straight memcpy can be performed.
-template<typename T>
+template <typename T>
 struct TensorCopier<T, T> {
   static void copy(T *destBuf, const T *srcBuf, std::size_t numElements) {
 #ifdef DEBUG_VALUE_CONVERSIONS
     std::cout << "TensorCopier: Using memcpy" << std::endl;
-    std::cout << "TensorCopier: destBuf = " << (void *) destBuf
-      << ", srcBuf = " << (void *) srcBuf << std::endl;
+    std::cout << "TensorCopier: destBuf = " << (void *)destBuf
+              << ", srcBuf = " << (void *)srcBuf << std::endl;
     std::cout << "TensorCopier: numElements = " << numElements
-      << ", sizeof(T) = " << sizeof(T) << std::endl;
+              << ", sizeof(T) = " << sizeof(T) << std::endl;
 #endif
     std::memcpy(destBuf, srcBuf, numElements * sizeof(T));
   }
@@ -383,7 +387,7 @@ struct TensorCopier<T, T> {
 // The layout of this struct must match the calling convention for the plugin.
 template <typename T>
 struct TensorData {
-  T* data;
+  T *data;
   size_t offset;
 
   size_t getIndex(size_t i, size_t j, size_t stride) const {
@@ -391,20 +395,16 @@ struct TensorData {
   }
 
   T getElement(size_t i, size_t j, size_t stride) const {
-    return data[getIndex(i,j, stride)];
+    return data[getIndex(i, j, stride)];
   }
 
   void setElement(size_t i, size_t j, size_t stride, float val) {
-    data[getIndex(i,j, stride)] = val;
+    data[getIndex(i, j, stride)] = val;
   }
 
-  T *get() {
-    return data + offset;
-  }
+  T *get() { return data + offset; }
 
-  const T *get() const {
-    return data + offset;
-  }
+  const T *get() const { return data + offset; }
 
   void dumpVals(std::ostream &os, std::size_t numElements) const {
     for (const T *p = get(), *pEnd = get() + numElements; p != pEnd; ++p)
@@ -413,7 +413,7 @@ struct TensorData {
   }
 
   std::ostream &dump(std::ostream &os) const {
-    return os << "data: " << (void *) data << ", offset: " << offset;
+    return os << "data: " << (void *)data << ", offset: " << offset;
   }
 
   friend std::ostream &operator<<(std::ostream &os, const TensorData &td) {
@@ -427,14 +427,15 @@ struct TensorData {
 // Functionality common to all variants of tensor binder
 template <typename ModelDType, typename KernelDType, typename ModelDataPtr>
 class TensorBinderCommon {
-protected:
+ protected:
   xrt::device device;
   int memoryBank = 0;
   std::size_t xrtBufferNumBytes;  // fixed size of XRT buffer
   xrt::bo bo;
 
   // Make sure that the XRT buffer is large enough to handle the model tensor
-  void checkBufferSizes(ModelDataPtr modelTensorData, std::size_t numModelElements) {
+  void checkBufferSizes(ModelDataPtr modelTensorData,
+                        std::size_t numModelElements) {
     std::size_t modelBufferNumBytes = numModelElements * sizeof(KernelDType);
     if (modelBufferNumBytes > xrtBufferNumBytes) {
       std::ostringstream oss;
@@ -445,28 +446,30 @@ class TensorBinderCommon {
     }
   }
 
-public:
-  TensorBinderCommon(xrt::device device, int memoryBank, std::size_t xrtBufferNumBytes)
-  : device(device), memoryBank(memoryBank), xrtBufferNumBytes(xrtBufferNumBytes)
-  {}
+ public:
+  TensorBinderCommon(xrt::device device, int memoryBank,
+                     std::size_t xrtBufferNumBytes)
+      : device(device),
+        memoryBank(memoryBank),
+        xrtBufferNumBytes(xrtBufferNumBytes) {}
 
   virtual ~TensorBinderCommon() {}
   xrt::bo getBo() { return bo; }
 };
 
-
 // Class for binding a HAL buffer to an XRT buffer (BO).
 //
 // In the general case, the HAL buffer is separate from the XRT buffer, so that
 // memory copies are done between the HAL buffer and XRT buffer
 template <typename ModelDType, typename KernelDType, typename ModelDataPtr>
-class TensorBinderBase : public TensorBinderCommon<ModelDType, KernelDType, ModelDataPtr> {
-protected:
+class TensorBinderBase
+    : public TensorBinderCommon<ModelDType, KernelDType, ModelDataPtr> {
+ protected:
   std::size_t numModelElements;  // number of elements in model tensor
-  ModelDataPtr modelTensorData = ModelDataPtr(); // pointer to HAL buffer
+  ModelDataPtr modelTensorData = ModelDataPtr();  // pointer to HAL buffer
   bool isInitialized = false;
 
-public:
+ public:
   using CommonClass = TensorBinderCommon<ModelDType, KernelDType, ModelDataPtr>;
   using CommonClass::CommonClass;
 
@@ -474,7 +477,7 @@ class TensorBinderBase : public TensorBinderCommon<ModelDType, KernelDType, Mode
     CommonClass::checkBufferSizes(modelTensorData, numModelElements);
     if (!isInitialized || numModelElements != this->numModelElements) {
       this->bo = xrt::bo(this->device, this->xrtBufferNumBytes,
-          XRT_BO_FLAGS_HOST_ONLY, this->memoryBank);
+                         XRT_BO_FLAGS_HOST_ONLY, this->memoryBank);
       isInitialized = true;
     }
     this->modelTensorData = modelTensorData;
@@ -484,16 +487,15 @@ class TensorBinderBase : public TensorBinderCommon<ModelDType, KernelDType, Mode
   void copyModelToXrt() {
     KernelDType *xrtBuf = this->bo.template map<KernelDType *>();
 #ifdef ENABLE_PERFORMANCE_WARNING
-    std::cout << "[AIE Delegate]: PERFORMANCE WARNING: using extra buffer copy!" << std::endl;
+    std::cout << "[AIE Delegate]: PERFORMANCE WARNING: using extra buffer copy!"
+              << std::endl;
 #endif
     TensorCopier<ModelDType, KernelDType>::copy(xrtBuf, modelTensorData,
-        numModelElements);
+                                                numModelElements);
     this->bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
   }
-
 };
 
-
 #ifndef USE_INDIRECT_XRT_BUFFERS
 // Special case for binding a HAL buffer directly to an XRT buffer (BO),
 // so that they share the same memory.
@@ -501,8 +503,9 @@ class TensorBinderBase : public TensorBinderCommon<ModelDType, KernelDType, Mode
 // This class can be used only if the Model (HAL) and kernel (XRT) data types
 // match.
 template <typename DType, typename ModelDataPtr>
-class TensorBinderBase<DType, DType, ModelDataPtr> : public TensorBinderCommon<DType, DType, ModelDataPtr> {
-public:
+class TensorBinderBase<DType, DType, ModelDataPtr>
+    : public TensorBinderCommon<DType, DType, ModelDataPtr> {
+ public:
   using CommonClass = TensorBinderCommon<DType, DType, ModelDataPtr>;
   using CommonClass::CommonClass;
 
@@ -512,30 +515,29 @@ class TensorBinderBase<DType, DType, ModelDataPtr> : public TensorBinderCommon<D
     // std::cout << "Using direct buffers" << std::endl;
 
     // Construct BO every time, as HAL buffer can be different with every call
-    this->bo = xrt::bo(this->device, (void *) modelTensorData,
-        this->xrtBufferNumBytes, this->memoryBank);
+    this->bo = xrt::bo(this->device, (void *)modelTensorData,
+                       this->xrtBufferNumBytes, this->memoryBank);
   }
 
-  void copyModelToXrt() {
-    this->bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  }
+  void copyModelToXrt() { this->bo.sync(XCL_BO_SYNC_BO_TO_DEVICE); }
 };
 #endif
 
-
 // TensorBinder whose HAL buffer CANNOT be written to
 template <typename ModelDType, typename KernelDType>
-class ConstTensorBinder : public TensorBinderBase<ModelDType, KernelDType, const ModelDType *> {
-public:
-  using BaseClass = TensorBinderBase<ModelDType, KernelDType, const ModelDType *>;
+class ConstTensorBinder
+    : public TensorBinderBase<ModelDType, KernelDType, const ModelDType *> {
+ public:
+  using BaseClass =
+      TensorBinderBase<ModelDType, KernelDType, const ModelDType *>;
   using BaseClass::BaseClass;
 };
 
-
 // TensorBinder whose HAL buffer CAN be written to, default case
 template <typename ModelDType, typename KernelDType>
-class MutableTensorBinder : public TensorBinderBase<ModelDType, KernelDType, ModelDType *> {
-public:
+class MutableTensorBinder
+    : public TensorBinderBase<ModelDType, KernelDType, ModelDType *> {
+ public:
   using BaseClass = TensorBinderBase<ModelDType, KernelDType, ModelDType *>;
   using BaseClass::BaseClass;
 
@@ -543,29 +545,27 @@ class MutableTensorBinder : public TensorBinderBase<ModelDType, KernelDType, Mod
     this->bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
     KernelDType *xrtBuf = this->bo.template map<KernelDType *>();
 #ifdef ENABLE_PERFORMANCE_WARNING
-    std::cout << "[AIE Delegate]: PERFORMANCE WARNING: using extra buffer copy!" << std::endl;
+    std::cout << "[AIE Delegate]: PERFORMANCE WARNING: using extra buffer copy!"
+              << std::endl;
 #endif
     TensorCopier<KernelDType, ModelDType>::copy(this->modelTensorData, xrtBuf,
-        this->numModelElements);
+                                                this->numModelElements);
   }
 };
 
-
 #ifndef USE_INDIRECT_XRT_BUFFERS
 // TensorBinder whose HAL buffer CAN be written to, conversion not required
 template <typename DType>
-class MutableTensorBinder<DType, DType> : public TensorBinderBase<DType, DType, DType *> {
-public:
+class MutableTensorBinder<DType, DType>
+    : public TensorBinderBase<DType, DType, DType *> {
+ public:
   using BaseClass = TensorBinderBase<DType, DType, DType *>;
   using BaseClass::BaseClass;
 
-  void copyXrtToModel() {
-    this->bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-  }
+  void copyXrtToModel() { this->bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE); }
 };
 #endif
 
-
 // Set of all arguments passed from model to plugin
 //
 // The layout of this struct must match the calling convention for the plugin.
@@ -584,7 +584,8 @@ struct Params {
   }
 
   std::ostream &dump(std::ostream &os) const {
-    return os << "lhs: (" << lhs << "), rhs: (" << rhs << "), result: " << result << ")";
+    return os << "lhs: (" << lhs << "), rhs: (" << rhs
+              << "), result: " << result << ")";
   }
 
   friend std::ostream &operator<<(std::ostream &os, const Params &p) {
@@ -592,9 +593,9 @@ struct Params {
   }
 };
 
-//#############################################################################
+// #############################################################################
 //
-// XRT host code implementation, adapted from Joe Melber's mlir-aie ref matmul
+//  XRT host code implementation, adapted from Joe Melber's mlir-aie ref matmul
 //
 
 std::vector<uint32_t> loadInstrSequence(std::string instr_path) {
@@ -619,29 +620,28 @@ std::vector<uint32_t> loadInstrSequence(std::string instr_path) {
 // Holder of AIE hardware resources of which there should be only one of each.
 // This class is used as a singleton via `getInstance()`.
 struct XrtState {
-    using LhsBinder = ConstTensorBinder<ModelLhsDType, A_DATATYPE>;
-    using RhsBinder = ConstTensorBinder<ModelRhsDType, B_DATATYPE>;
-    using ResultBinder = MutableTensorBinder<ModelReturnDType, C_DATATYPE>;
-
-    xrt::device device;
-    xrt::kernel kernel;
-    xrt::bo boInstr;
-    std::unique_ptr<LhsBinder> lhsBinder;
-    std::unique_ptr<RhsBinder> rhsBinder;
-    std::unique_ptr<ResultBinder> resultBinder;
-
-    static XrtState *getInstance(bool shouldDelete = false) {
-        // TODO: handle multiple simultaneous dispatches, multiple kernels
-        static XrtState *instance = nullptr;
-        if (shouldDelete) {
-            delete instance;
-            instance = nullptr;
-            return nullptr;
-        }
-        if (instance == nullptr)
-            instance = new XrtState();
-        return instance;
+  using LhsBinder = ConstTensorBinder<ModelLhsDType, A_DATATYPE>;
+  using RhsBinder = ConstTensorBinder<ModelRhsDType, B_DATATYPE>;
+  using ResultBinder = MutableTensorBinder<ModelReturnDType, C_DATATYPE>;
+
+  xrt::device device;
+  xrt::kernel kernel;
+  xrt::bo boInstr;
+  std::unique_ptr<LhsBinder> lhsBinder;
+  std::unique_ptr<RhsBinder> rhsBinder;
+  std::unique_ptr<ResultBinder> resultBinder;
+
+  static XrtState *getInstance(bool shouldDelete = false) {
+    // TODO: handle multiple simultaneous dispatches, multiple kernels
+    static XrtState *instance = nullptr;
+    if (shouldDelete) {
+      delete instance;
+      instance = nullptr;
+      return nullptr;
     }
+    if (instance == nullptr) instance = new XrtState();
+    return instance;
+  }
 };
 
 constexpr int M = MLP_M;
@@ -817,42 +817,47 @@ void aie_matmul(Params *params) {
   TRACE_DELEGATE("aie_matmul done");
 }
 
-//#############################################################################
+// #############################################################################
 //
-// Reference scalar CPU implementation, adapted from Mahesh's CPU delegate
-// in iree/samples/custom_dispatch/cpu/mlp_plugin
+//  Reference scalar CPU implementation, adapted from Mahesh's CPU delegate
+//  in iree/samples/custom_dispatch/cpu/mlp_plugin
 //
 
 // Type for accumulating the multiplications over the k dimension
-using CpuAccDType = 
+using CpuAccDType =
 #ifdef USE_BF16_CPU_ACCUMULATOR
-  bfloat16_t;
+    bfloat16_t;
 #else
-  float;
+    float;
 #endif
 
 static void cpu_matmul(Params *params) {
-  std::cout << "[AIE Delegate]: Computing CPU scalar matmul of " << params->getShapeStr() << std::endl;
+  std::cout << "[AIE Delegate]: Computing CPU scalar matmul of "
+            << params->getShapeStr() << std::endl;
   for (int32_t i = 0; i < params->M; i++) {
     for (int32_t j = 0; j < params->N; j++) {
       CpuAccDType curr_result = Converter<float, CpuAccDType>::convert(0.0);
       for (int32_t k = 0; k < params->K; k++) {
-        float a = Converter<ModelLhsDType, float>::convert(params->lhs.getElement(i, k, K));
-        float b = Converter<ModelRhsDType, float>::convert(params->rhs.getElement(k, j, N));
+        float a = Converter<ModelLhsDType, float>::convert(
+            params->lhs.getElement(i, k, K));
+        float b = Converter<ModelRhsDType, float>::convert(
+            params->rhs.getElement(k, j, N));
         curr_result = Converter<float, CpuAccDType>::convert(
-          Converter<CpuAccDType, float>::convert(curr_result)
-          + Converter<float, CpuAccDType>::convert(a * b)
-        );
+            Converter<CpuAccDType, float>::convert(curr_result) +
+            Converter<float, CpuAccDType>::convert(a * b));
       }
-      // curr_result = curr_result < 0.0 ? 0.0 : curr_result;  ref matmul doesn't seem to have this
-      params->result.setElement(i, j, N, Converter<CpuAccDType, ModelReturnDType>::convert(curr_result));
+      // curr_result = curr_result < 0.0 ? 0.0 : curr_result;  ref matmul
+      // doesn't seem to have this
+      params->result.setElement(
+          i, j, N,
+          Converter<CpuAccDType, ModelReturnDType>::convert(curr_result));
     }
   }
 }
 
-//#############################################################################
+// #############################################################################
 //
-// Implementation of API of IREE Dynamic Plugin
+//  Implementation of API of IREE Dynamic Plugin
 //
 
 // Stateful plugin instance.
@@ -861,10 +866,9 @@ static void cpu_matmul(Params *params) {
 // context argument.
 typedef struct {
   iree_hal_executable_plugin_allocator_t host_allocator;
-  FILE* file;
+  FILE *file;
 } mlp_plugin_t;
 
-
 // `ret = mlp(lhs, rhs)`
 //
 // Conforms to ABI:
@@ -887,10 +891,11 @@ typedef struct {
 //
 // Expects a return of 0 on success and any other value indicates failure.
 // Try not to fail!
-static int mlp_external(void* params_ptr, void* context, void* reserved) {
+static int mlp_external(void *params_ptr, void *context, void *reserved) {
   auto plugin = reinterpret_cast<mlp_plugin_t *>(context);
   auto params = reinterpret_cast<Params *>(params_ptr);
-  // fprintf(plugin->file, "[AIE Delegate]: M = %d, N = %d, K = %d\n", params->M,
+  // fprintf(plugin->file, "[AIE Delegate]: M = %d, N = %d, K = %d\n",
+  // params->M,
   //         params->N, params->K);
   TRACE_DELEGATE("mlp_external");
 
@@ -901,12 +906,13 @@ static int mlp_external(void* params_ptr, void* context, void* reserved) {
   // make sure AIE version is getting used
   if (params->M != MLP_M || params->K != MLP_K || params->N != MLP_N) {
     std::ostringstream oss;
-    oss << "[AIE Delegate] FATAL ERROR: Shape mismatch between model and kernel."
+    oss << "[AIE Delegate] FATAL ERROR: Shape mismatch between model and "
+           "kernel."
+        << std::endl;
+    oss << "    Model shape: M=" << params->M << ", N=" << params->N
+        << ", K=" << params->K << std::endl;
+    oss << "    Kernel shape: M=" << MLP_M << ", N=" << MLP_N << ", K=" << MLP_K
         << std::endl;
-    oss << "    Model shape: M=" << params->M << ", N=" << params->N << ", K="
-        << params->K << std::endl;
-    oss << "    Kernel shape: M=" << MLP_M << ", N=" << MLP_N << ", K="
-        << MLP_K << std::endl;
     throw DelegateException(oss.str());
   }
 
@@ -926,15 +932,15 @@ static int mlp_external(void* params_ptr, void* context, void* reserved) {
 // instance. Note that there may be multiple instances of a plugin in any
 // particular process and this must be thread-safe.
 static iree_hal_executable_plugin_status_t mlp_plugin_load(
-    const iree_hal_executable_plugin_environment_v0_t* environment,
-    size_t param_count, const iree_hal_executable_plugin_string_pair_t* params,
-    void** out_self) {
+    const iree_hal_executable_plugin_environment_v0_t *environment,
+    size_t param_count, const iree_hal_executable_plugin_string_pair_t *params,
+    void **out_self) {
   TRACE_DELEGATE("mlp_plugin_load");
   // Allocate the plugin state.
-  mlp_plugin_t* plugin = NULL;
+  mlp_plugin_t *plugin = NULL;
   iree_hal_executable_plugin_status_t status =
       iree_hal_executable_plugin_allocator_malloc(
-          environment->host_allocator, sizeof(*plugin), (void**)&plugin);
+          environment->host_allocator, sizeof(*plugin), (void **)&plugin);
   if (status) return status;
   plugin->host_allocator = environment->host_allocator;
 
@@ -954,8 +960,8 @@ static iree_hal_executable_plugin_status_t mlp_plugin_load(
 }
 
 // Called to free any plugin state allocated in load.
-static void mlp_plugin_unload(void* self) {
-  mlp_plugin_t* plugin = (mlp_plugin_t*)self;
+static void mlp_plugin_unload(void *self) {
+  mlp_plugin_t *plugin = (mlp_plugin_t *)self;
   iree_hal_executable_plugin_allocator_t host_allocator =
       plugin->host_allocator;
 
@@ -965,7 +971,7 @@ static void mlp_plugin_unload(void* self) {
   plugin->file = NULL;
 
 #ifndef USE_CPU_IMPLEMENTATION
-  XrtState::getInstance(true); // delete singleton data
+  XrtState::getInstance(true);  // delete singleton data
 #endif
 
   // Free the plugin state using the same allocator it came from.
@@ -976,15 +982,15 @@ static void mlp_plugin_unload(void* self) {
 // See the plugin API header for more information. Note that some of the
 // functions may already be resolved and some may be optional.
 static iree_hal_executable_plugin_status_t mlp_plugin_resolve(
-    void* self, const iree_hal_executable_plugin_resolve_params_v0_t* params,
-    iree_hal_executable_plugin_resolution_t* out_resolution) {
+    void *self, const iree_hal_executable_plugin_resolve_params_v0_t *params,
+    iree_hal_executable_plugin_resolution_t *out_resolution) {
   TRACE_DELEGATE("mlp_plugin_resolve");
-  mlp_plugin_t* plugin = (mlp_plugin_t*)self;
+  mlp_plugin_t *plugin = (mlp_plugin_t *)self;
   *out_resolution = 0;
   bool any_required_not_found = false;
   for (size_t i = 0; i < params->count; ++i) {
     if (params->out_fn_ptrs[i]) continue;
-    const char* symbol_name = params->symbol_names[i];
+    const char *symbol_name = params->symbol_names[i];
     bool is_optional =
         iree_hal_executable_plugin_import_is_optional(symbol_name);
     if (is_optional) ++symbol_name;
@@ -1008,7 +1014,6 @@ static iree_hal_executable_plugin_status_t mlp_plugin_resolve(
              : iree_hal_executable_plugin_ok_status();
 }
 
-
 extern "C" {
 
 // Exported on the shared library and used by the runtime to query the plugin
@@ -1016,9 +1021,9 @@ extern "C" {
 // can be called and can have any name to allow for multiple plugins. When
 // dynamically linking the exported symbol must be exactly this with no C++
 // name mangling.
-IREE_HAL_EXECUTABLE_PLUGIN_EXPORT const iree_hal_executable_plugin_header_t**
+IREE_HAL_EXECUTABLE_PLUGIN_EXPORT const iree_hal_executable_plugin_header_t **
 iree_hal_executable_plugin_query(
-    iree_hal_executable_plugin_version_t max_version, void* reserved) {
+    iree_hal_executable_plugin_version_t max_version, void *reserved) {
   static const iree_hal_executable_plugin_header_t header = {
       // Declares what library version is present: newer runtimes may support
       // loading older plugins but newer plugins cannot load on older runtimes.
@@ -1039,8 +1044,7 @@ iree_hal_executable_plugin_query(
       .resolve = mlp_plugin_resolve,
   };
   return max_version <= IREE_HAL_EXECUTABLE_PLUGIN_VERSION_LATEST
-             ? (const iree_hal_executable_plugin_header_t**)&plugin
+             ? (const iree_hal_executable_plugin_header_t **)&plugin
              : NULL;
 }
-
 }
diff --git a/experimental/delegate/mlp_spec_matmul_elementwise.mlir b/experimental/delegate/mlp_spec_matmul_elementwise.mlir
index 42bef126b..ac9fcb102 100644
--- a/experimental/delegate/mlp_spec_matmul_elementwise.mlir
+++ b/experimental/delegate/mlp_spec_matmul_elementwise.mlir
@@ -67,11 +67,11 @@ module attributes {transform.with_named_sequence} {
             ins(%lhs, %rhs : tensor<8192x2432xf32>, tensor<2432x9728xf32>)
                 outs(%fill :  tensor<8192x9728xf32>) ->  tensor<8192x9728xf32>
       %add = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, 
-                        affine_map<(d0, d1) -> (d1)>, 
-                        affine_map<(d0, d1) -> (d0, d1)>], 
-        iterator_types = ["parallel", "parallel"]} 
-        ins(%matmul, %bias : tensor<8192x9728xf32>, tensor<9728xf32>) 
+        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                        affine_map<(d0, d1) -> (d1)>,
+                        affine_map<(d0, d1) -> (d0, d1)>],
+        iterator_types = ["parallel", "parallel"]}
+        ins(%matmul, %bias : tensor<8192x9728xf32>, tensor<9728xf32>)
         outs(%init2 : tensor<8192x9728xf32>) {
       ^bb0(%in: f32, %in_18629: f32, %out: f32):
         %33290 = arith.addf %in, %in_18629 : f32
diff --git a/experimental/delegate/opt.mlir b/experimental/delegate/opt.mlir
index 897148a02..cd78707fa 100644
--- a/experimental/delegate/opt.mlir
+++ b/experimental/delegate/opt.mlir
@@ -51,7 +51,7 @@ module @example attributes {hal.device.targets = [#cpu_target]} {
       %884 = arith.addf %in, %in_372 : bf16
       linalg.yield %884 : bf16
     } -> tensor<1x8x768xbf16>
-    
+
     %cst_191 = arith.constant dense<1.31072e+05> : tensor<768xbf16>
     %69 = linalg.batch_matmul ins(%66, %rhs : tensor<1x8x768xbf16>, tensor<1x768x768xbf16>) outs(%64 : tensor<1x8x768xbf16>) -> tensor<1x8x768xbf16>
 
diff --git a/experimental/delegate/opt.pdl.mlir b/experimental/delegate/opt.pdl.mlir
index d46a5602d..985e1a190 100644
--- a/experimental/delegate/opt.pdl.mlir
+++ b/experimental/delegate/opt.pdl.mlir
@@ -52,7 +52,7 @@ pdl.pattern @mlp : benefit(1) {
   %lhs = pdl.operand
   %rhs = pdl.operand
   %empty = pdl.operand
-  
+
   %lhs_type = pdl.type : tensor<1x8x768xbf16>
   %rhs_type = pdl.type : tensor<1x768x768xbf16>
   %matmul_type = pdl.type : tensor<1x8x768xbf16>
@@ -69,11 +69,11 @@ pdl.pattern @mlp : benefit(1) {
   %zero_bf16_op = pdl.operation "arith.constant" {"value" = %zero_val_bf16} -> (%bf16_type : !pdl.type)
   %zero_bf16 = pdl.result 0 of %zero_bf16_op
 
-  
+
   %fill_op = pdl.operation "linalg.fill" (%zero_bf16, %empty : !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
   %fill = pdl.result 0 of %fill_op
   %matmul = pdl.operation "linalg.batch_matmul" (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%matmul_type : !pdl.type)
-  
+
   pdl.rewrite %matmul {
     %i32_type = pdl.type : i32
     %m_op = pdl.operation "arith.constant" {"value" = %fixed_M} -> (%i32_type : !pdl.type)
diff --git a/runtime/src/iree-amd-aie/aie_runtime/AMDAIEEnums.cpp b/runtime/src/iree-amd-aie/aie_runtime/AMDAIEEnums.cpp
index 3955aedd1..39a8caecc 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/AMDAIEEnums.cpp
+++ b/runtime/src/iree-amd-aie/aie_runtime/AMDAIEEnums.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree-amd-aie/aie_runtime/AMDAIEEnums.h"
+
 #include "mlir/Dialect/Arith/IR/Arith.h"
 
 #define GET_ATTRDEF_CLASSES
diff --git a/runtime/src/iree-amd-aie/aie_runtime/CMakeLists.txt b/runtime/src/iree-amd-aie/aie_runtime/CMakeLists.txt
index b25901208..7140a7b2c 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/CMakeLists.txt
+++ b/runtime/src/iree-amd-aie/aie_runtime/CMakeLists.txt
@@ -76,4 +76,3 @@ target_link_libraries(iree-amd-aie_aie_runtime_iree_aie_runtime_static
 if (IREE_BUILD_TESTS)
   add_subdirectory(test)
 endif()
-
diff --git a/runtime/src/iree-amd-aie/aie_runtime/d_ary_heap.h b/runtime/src/iree-amd-aie/aie_runtime/d_ary_heap.h
index e34833ddc..ce54f5e88 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/d_ary_heap.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/d_ary_heap.h
@@ -358,4 +358,4 @@ class d_ary_heap_indirect
 };
 
 #endif // D_ARY_HEAP_HPP
-// clang-format on
\ No newline at end of file
+// clang-format on
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h
index 449eb6ca2..78d355a94 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h
@@ -23,8 +23,7 @@ struct Port {
 
   // mlir-air legacy
   Port() : bundle(), channel() {}
-  Port(StrmSwPortType b, int c)
-      : bundle(b), channel(c) {}
+  Port(StrmSwPortType b, int c) : bundle(b), channel(c) {}
   typedef std::tuple<StrmSwPortType, int> TupleType;
   Port(TupleType t) : Port(std::get<0>(t), std::get<1>(t)) {}
   operator TupleType() const { return {bundle, channel}; }
@@ -174,7 +173,6 @@ TO_STRINGS(TO_STRING_DECL)
   _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPortAndID) \
   _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPort::Direction)
 
-
 BOTH_OSTREAM_OPS_FORALL_ROUTER_TYPES(OSTREAM_OP_DECL, BOTH_OSTREAM_OP)
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
index ec0287416..0029312f7 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
@@ -700,8 +700,8 @@ static constexpr uint32_t getElementTypeKey(uint32_t a, uint32_t b,
 /// This first line says that if 'lhs' is an i8 tensor, 'rhs' is an i4 tensor
 /// and 'accumulator' is an i32 tensor, then there is an AIE instruction for
 /// matmul with m = 4, n = 8, k = 16.
-static llvm::DenseMap<uint32_t, std::array<uint32_t, 3>>
-    &getNpu1IntegerMatmulInstructionSizeMap() {
+static llvm::DenseMap<uint32_t, std::array<uint32_t, 3>> &
+getNpu1IntegerMatmulInstructionSizeMap() {
   // Sanity check.
   static_assert(getElementTypeKey(1, 2, 3) == 1 + 2 * 256 + 3 * 65536);
 
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
index fa4ac7e0b..b7719a43b 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
@@ -237,28 +237,31 @@ struct AMDAIEDeviceModel {
     /// Constant specifying the number of inter-iteration dimension for DMA
     /// operations.
     ///
-    /// NOTE(jornt): this number is implicitly assumed in the device model and can't
-    /// be retrieved from it afaik.
+    /// NOTE(jornt): this number is implicitly assumed in the device model and
+    /// can't be retrieved from it afaik.
     ///
     /// Some background:
     ///
-    /// DMAs support multi-dimensional addressing through buffer descriptors in two
-    /// ways:
-    /// 1. Intra-iteration access pattern. Specified via 'strides' ('steps' in buffer
-    /// descriptor lingo), 'sizes' ('wraps' in buffer descriptro lingo) and
-    /// 'padding'. When a DMA executes a buffer descriptor, it will access the data
-    /// (read/write) as specified by the intra-iteration access pattern.
+    /// DMAs support multi-dimensional addressing through buffer descriptors in
+    /// two ways:
+    /// 1. Intra-iteration access pattern. Specified via 'strides' ('steps' in
+    /// buffer descriptor lingo), 'sizes' ('wraps' in buffer descriptor lingo)
+    /// and 'padding'. When a DMA executes a buffer descriptor, it will access
+    /// the data (read/write) as specified by the intra-iteration access
+    /// pattern.
     /// 2. Inter-iteration access pattern. Specified via an iteration 'stride',
-    /// 'size' and 'current_iteration' ('stride' is the same as 'stepsize' and 'size'
-    /// is the same as 'wrap' in buffer descriptor lingo). Here, 'current_iteration'
-    /// keeps track of the current execution iteration of the buffer descriptor and
-    /// is incremented after buffer descriptor execution. the 'stride' is the offset
-    /// to be used for each execution of the buffer descriptor, relative to the
-    /// previous one. When 'iteration_current' is equal to 'size', the
-    /// 'iteration_current' is reset to zero.
+    /// 'size' and 'current_iteration' ('stride' is the same as 'stepsize' and
+    /// 'size' is the same as 'wrap' in buffer descriptor lingo). Here,
+    /// 'current_iteration' keeps track of the current execution iteration of
+    /// the buffer descriptor and is incremented after buffer descriptor
+    /// execution. The 'stride' is the offset to be used for each execution of
+    /// the buffer descriptor, relative to the previous one. When
+    /// 'iteration_current' is equal to 'size', the 'iteration_current' is reset
+    /// to zero.
     ///
-    /// Although DMAs can have a different number of intra-iteration dimensions, all
-    /// DMAs have a single inter-iteration dimension (at least in AIE2 and AIE2p).
+    /// Although DMAs can have a different number of intra-iteration dimensions,
+    /// all DMAs have a single inter-iteration dimension (at least in AIE2 and
+    /// AIE2p).
     uint8_t dmaNbInterDims = 1;
     /// The number of shim tile rows. Not found in aie-rt data structures, but
     /// provided as `XAIE_SHIM_NUM_ROWS`.
diff --git a/runtime/src/iree-amd-aie/aie_runtime/test/CMakeLists.txt b/runtime/src/iree-amd-aie/aie_runtime/test/CMakeLists.txt
index 45f497d5f..fafd44a6b 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/test/CMakeLists.txt
+++ b/runtime/src/iree-amd-aie/aie_runtime/test/CMakeLists.txt
@@ -37,7 +37,7 @@ iree_cc_test(
     test_0335_aie_dma_tile_dma_packet_switch_mode
   SRCS
     test_packet_switch_mode.cc
-  COPTS    
+  COPTS
     $<$<PLATFORM_ID:Linux>:-Wno-format>
     $<$<PLATFORM_ID:Darwin>:-Wno-format>
     $<$<PLATFORM_ID:Windows>:/wd4777>
diff --git a/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc b/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc
index 635c4eaa8..bef5da2a5 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc
+++ b/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc
@@ -18,7 +18,8 @@ TEST(AMSelGeneratorTest, TileNotInitialized) {
   TileLoc tileLoc(0, 1);
   PhysPortAndID src1 = {
       {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(failed(generator.addConnection(tileLoc, src1, {dst1})));
 }
 
@@ -26,8 +27,10 @@ TEST(AMSelGeneratorTest, NoArbitersNoMSels) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 0, 0);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1})));
   EXPECT_TRUE(failed(generator.solve()));
 }
@@ -36,8 +39,10 @@ TEST(AMSelGeneratorTest, NoArbiters) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 0, 4);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1})));
   EXPECT_TRUE(failed(generator.solve()));
 }
@@ -46,8 +51,10 @@ TEST(AMSelGeneratorTest, NoMSels) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 6, 0);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1})));
   EXPECT_TRUE(failed(generator.solve()));
 }
@@ -56,19 +63,24 @@ TEST(AMSelGeneratorTest, SingleSrcSingleDst) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 6, 4);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1})));
   EXPECT_TRUE(succeeded(generator.solve()));
   EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0));
   for (int i = 1; i < 6; i++) {
-    PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, i}, PhysPort::Direction::SRC}, i};
-    PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, i}, PhysPort::Direction::DST}, i};
+    PhysPortAndID src2 = {
+        {{0, 1}, {StrmSwPortType::SOUTH, i}, PhysPort::Direction::SRC}, i};
+    PhysPortAndID dst2 = {
+        {{0, 1}, {StrmSwPortType::NORTH, i}, PhysPort::Direction::DST}, i};
     EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2})));
   }
   EXPECT_TRUE(succeeded(generator.solve()));
   for (int i = 0; i < 6; i++) {
-    PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, i}, PhysPort::Direction::SRC}, i};
+    PhysPortAndID src = {
+        {{0, 1}, {StrmSwPortType::SOUTH, i}, PhysPort::Direction::SRC}, i};
     EXPECT_EQ(generator.getAMSel(tileLoc, src).value(), amsel(i, 0));
   }
 }
@@ -80,13 +92,16 @@ TEST(AMSelGeneratorTest, SingleSrcSingleDstSamePorts) {
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 6, 4);
   for (int i = 0; i < 6; i++) {
-    PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, i};
-    PhysPortAndID dst = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, i};
+    PhysPortAndID src = {
+        {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, i};
+    PhysPortAndID dst = {
+        {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, i};
     EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src, {dst})));
   }
   EXPECT_TRUE(succeeded(generator.solve()));
   for (int i = 0; i < 6; i++) {
-    PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, i};
+    PhysPortAndID src = {
+        {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, i};
     EXPECT_EQ(generator.getAMSel(tileLoc, src).value(), amsel(0, 0));
   }
 }
@@ -95,15 +110,21 @@ TEST(AMSelGeneratorTest, SingleSrcMultiDst) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 6, 4);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
-  PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::EAST, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID dst2 = {
+      {{0, 1}, {StrmSwPortType::EAST, 0}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2})));
   EXPECT_TRUE(succeeded(generator.solve()));
   EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0));
-  PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1};
-  PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1};
-  PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::EAST, 1}, PhysPort::Direction::DST}, 1};
+  PhysPortAndID src2 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1};
+  PhysPortAndID dst3 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1};
+  PhysPortAndID dst4 = {
+      {{0, 1}, {StrmSwPortType::EAST, 1}, PhysPort::Direction::DST}, 1};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4})));
   EXPECT_TRUE(succeeded(generator.solve()));
   EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(1, 0));
@@ -114,19 +135,27 @@ TEST(AMSelGeneratorTest, MultiSrcSingleDst) {
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 6, 4);
   // Reuse msels for multiple sources.
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID src2 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst1})));
   EXPECT_TRUE(succeeded(generator.solve()));
   EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0));
   EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(0, 0));
-  PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 1};
-  PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::EAST, 0}, PhysPort::Direction::SRC}, 1};
-  PhysPortAndID src5 = {{{0, 1}, {StrmSwPortType::EAST, 1}, PhysPort::Direction::SRC}, 1};
-  PhysPortAndID src6 = {{{0, 1}, {StrmSwPortType::EAST, 2}, PhysPort::Direction::SRC}, 1};
-  PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src3 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 1};
+  PhysPortAndID src4 = {
+      {{0, 1}, {StrmSwPortType::EAST, 0}, PhysPort::Direction::SRC}, 1};
+  PhysPortAndID src5 = {
+      {{0, 1}, {StrmSwPortType::EAST, 1}, PhysPort::Direction::SRC}, 1};
+  PhysPortAndID src6 = {
+      {{0, 1}, {StrmSwPortType::EAST, 2}, PhysPort::Direction::SRC}, 1};
+  PhysPortAndID dst2 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst2})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src4, {dst2})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src5, {dst2})));
@@ -144,23 +173,35 @@ TEST(AMSelGeneratorTest, MultiSrcMultiDst) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 6, 4);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
-  PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0};
-  PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1};
-  PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 1};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID src2 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID dst2 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID dst3 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1};
+  PhysPortAndID dst4 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 1};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4})));
   EXPECT_TRUE(succeeded(generator.solve()));
   EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0));
   EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(0, 1));
-  PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 2};
-  PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 3};
-  PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::WEST, 0}, PhysPort::Direction::DST}, 2};
-  PhysPortAndID dst6 = {{{0, 1}, {StrmSwPortType::WEST, 1}, PhysPort::Direction::DST}, 2};
-  PhysPortAndID dst7 = {{{0, 1}, {StrmSwPortType::WEST, 0}, PhysPort::Direction::DST}, 3};
-  PhysPortAndID dst8 = {{{0, 1}, {StrmSwPortType::WEST, 2}, PhysPort::Direction::DST}, 3};
+  PhysPortAndID src3 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 2};
+  PhysPortAndID src4 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 3};
+  PhysPortAndID dst5 = {
+      {{0, 1}, {StrmSwPortType::WEST, 0}, PhysPort::Direction::DST}, 2};
+  PhysPortAndID dst6 = {
+      {{0, 1}, {StrmSwPortType::WEST, 1}, PhysPort::Direction::DST}, 2};
+  PhysPortAndID dst7 = {
+      {{0, 1}, {StrmSwPortType::WEST, 0}, PhysPort::Direction::DST}, 3};
+  PhysPortAndID dst8 = {
+      {{0, 1}, {StrmSwPortType::WEST, 2}, PhysPort::Direction::DST}, 3};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst5, dst6})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src4, {dst7, dst8})));
   EXPECT_TRUE(succeeded(generator.solve()));
@@ -174,14 +215,22 @@ TEST(AMSelGeneratorTest, ReuseArbiters) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 1, 4);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1};
-  PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}, PhysPort::Direction::SRC}, 2};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
-  PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0};
-  PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 1};
-  PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 3}, PhysPort::Direction::DST}, 1};
-  PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::NORTH, 4}, PhysPort::Direction::DST}, 2};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID src2 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1};
+  PhysPortAndID src3 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 2}, PhysPort::Direction::SRC}, 2};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID dst2 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID dst3 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 1};
+  PhysPortAndID dst4 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 3}, PhysPort::Direction::DST}, 1};
+  PhysPortAndID dst5 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 4}, PhysPort::Direction::DST}, 2};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst5})));
@@ -195,12 +244,18 @@ TEST(AMSelGeneratorTest, ReuseArbitersFailure) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 1, 2);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1};
-  PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}, PhysPort::Direction::SRC}, 2};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
-  PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1};
-  PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 2};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID src2 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1};
+  PhysPortAndID src3 = {
+      {{0, 1}, {StrmSwPortType::SOUTH, 2}, PhysPort::Direction::SRC}, 2};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID dst2 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1};
+  PhysPortAndID dst3 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 2};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst3})));
@@ -211,10 +266,14 @@ TEST(AMSelGeneratorTest, DifferentDirections) {
   AMSelGenerator generator;
   TileLoc tileLoc(0, 1);
   generator.initTileIfNotExists(tileLoc, 6, 4);
-  PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::DMA, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
-  PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::SRC}, 0};
-  PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::DMA, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src1 = {
+      {{0, 1}, {StrmSwPortType::DMA, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID dst1 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0};
+  PhysPortAndID src2 = {
+      {{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::SRC}, 0};
+  PhysPortAndID dst2 = {
+      {{0, 1}, {StrmSwPortType::DMA, 0}, PhysPort::Direction::DST}, 0};
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1})));
   EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2})));
   EXPECT_TRUE(succeeded(generator.solve()));
diff --git a/runtime/src/iree-amd-aie/aie_runtime/test/test_control_packets.cc b/runtime/src/iree-amd-aie/aie_runtime/test/test_control_packets.cc
index f9c204a1b..20b70e53c 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/test/test_control_packets.cc
+++ b/runtime/src/iree-amd-aie/aie_runtime/test/test_control_packets.cc
@@ -78,11 +78,11 @@ clang-format on
  *memory at ouput address (output_add = 0x4000) .After I am comparing the input
  *and output buffer.
  *
- * @param	None.
+ * @param None.
  *
- * @return	0 on success and error code on failure.
+ * @return 0 on success and error code on failure.
  *
- * @note		None.
+ * @note None.
  ******************************************************************************/
 int main(int argc, char **argv) {
   AieRC RC = XAIE_OK;
diff --git a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c
index 515491ef0..37b791425 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c
+++ b/runtime/src/iree-amd-aie/aie_runtime/xaie_hwcfg.c
@@ -4,8 +4,8 @@
 // https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: # Apache-2.0 WITH LLVM-exception
 
-#include "xaiengine/xaiegbl_defs.h"
 #include "xaiengine/xaiegbl.h"
+#include "xaiengine/xaiegbl_defs.h"
 #undef s8
 #undef u8
 #undef u16
@@ -105,7 +105,6 @@ const int XAIEML_NUM_NOC_INTR_OFFSET = XAIE_NUM_NOC_INTR_OFFSET;
 
 const uint64_t XAIEML_PARTITION_BASE_ADDR = XAIE_BASE_ADDR;
 
-
 #undef XAIE_BASE_ADDR
 #undef XAIE_NPI_BASEADDR
 #undef XAIE_NUM_ROWS
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir
index b7b671f8b..94625bd65 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir
@@ -35,4 +35,3 @@ hal.executable.source public @amdaie_fb {
     }
   }
 }
-
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h
index 914feac0a..67a0485d4 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h
@@ -30,8 +30,8 @@ extern "C" {
  * Example code:
  *
  * struct amdxdna_drm_example_data {
- *	.ext = (uintptr_t)&example_data_ext;
- *	...
+ *  .ext = (uintptr_t)&example_data_ext;
+ *  ...
  * };
  *
  * We don't have extension now. The extension struct will define in the future.
@@ -174,7 +174,7 @@ enum amdxdna_drm_config_hwctx_param {
  *              structure passed in via param_val.
  * @param_val: A structure specified by the param_type struct member.
  * @param_val_size: Size of the parameter buffer pointed to by the param_val.
- *		    If param_val is not a pointer, driver can ignore this.
+ *                  If param_val is not a pointer, driver can ignore this.
  * @pad: Structure padding.
  *
  * Note: if the param_val is a pointer pointing to a buffer, the maximum size
@@ -189,11 +189,11 @@ struct amdxdna_drm_config_hwctx {
 };
 
 /*
- * AMDXDNA_BO_SHMEM:	DRM GEM SHMEM bo
+ * AMDXDNA_BO_SHMEM:  DRM GEM SHMEM bo
  * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory
- * AMDXDNA_BO_DEV_BO:	Allocated from BO_DEV_HEAP
- * AMDXDNA_BO_CMD:	User and driver accessible bo
- * AMDXDNA_BO_DMA:	DRM GEM DMA bo
+ * AMDXDNA_BO_DEV_BO: Allocated from BO_DEV_HEAP
+ * AMDXDNA_BO_CMD:  User and driver accessible bo
+ * AMDXDNA_BO_DMA:  DRM GEM DMA bo
  */
 enum amdxdna_bo_type {
   AMDXDNA_BO_INVALID = 0,
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h
index 058b68530..c78797361 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h
@@ -855,20 +855,23 @@ uint32_t ert_base_addr = 0;
 #define ERT_INTC_CU_0_31_IPR (ERT_INTC_CU_0_31_ADDR + 0x4)  /* pending */
 #define ERT_INTC_CU_0_31_IER (ERT_INTC_CU_0_31_ADDR + 0x8)  /* enable */
 #define ERT_INTC_CU_0_31_IAR (ERT_INTC_CU_0_31_ADDR + 0x0C) /* acknowledge */
-#define ERT_INTC_CU_0_31_MER (ERT_INTC_CU_0_31_ADDR + 0x1C) /* master enable \
-                                                             */
-
-#define ERT_INTC_CU_32_63_IPR (ERT_INTC_CU_32_63_ADDR + 0x4)  /* pending */
-#define ERT_INTC_CU_32_63_IER (ERT_INTC_CU_32_63_ADDR + 0x8)  /* enable */
-#define ERT_INTC_CU_32_63_IAR (ERT_INTC_CU_32_63_ADDR + 0x0C) /* acknowledge \
-                                                               */
+#define ERT_INTC_CU_0_31_MER                      \
+  (ERT_INTC_CU_0_31_ADDR + 0x1C) /* master enable \
+                                  */
+
+#define ERT_INTC_CU_32_63_IPR (ERT_INTC_CU_32_63_ADDR + 0x4) /* pending */
+#define ERT_INTC_CU_32_63_IER (ERT_INTC_CU_32_63_ADDR + 0x8) /* enable */
+#define ERT_INTC_CU_32_63_IAR                    \
+  (ERT_INTC_CU_32_63_ADDR + 0x0C) /* acknowledge \
+                                   */
 #define ERT_INTC_CU_32_63_MER \
   (ERT_INTC_CU_32_63_ADDR + 0x1C) /* master enable */
 
-#define ERT_INTC_CU_64_95_IPR (ERT_INTC_CU_64_95_ADDR + 0x4)  /* pending */
-#define ERT_INTC_CU_64_95_IER (ERT_INTC_CU_64_95_ADDR + 0x8)  /* enable */
-#define ERT_INTC_CU_64_95_IAR (ERT_INTC_CU_64_95_ADDR + 0x0C) /* acknowledge \
-                                                               */
+#define ERT_INTC_CU_64_95_IPR (ERT_INTC_CU_64_95_ADDR + 0x4) /* pending */
+#define ERT_INTC_CU_64_95_IER (ERT_INTC_CU_64_95_ADDR + 0x8) /* enable */
+#define ERT_INTC_CU_64_95_IAR                    \
+  (ERT_INTC_CU_64_95_ADDR + 0x0C) /* acknowledge \
+                                   */
 #define ERT_INTC_CU_64_95_MER \
   (ERT_INTC_CU_64_95_ADDR + 0x1C) /* master enable */
 
diff --git a/runtime/src/iree-amd-aie/driver/xrt/api.h b/runtime/src/iree-amd-aie/driver/xrt/api.h
index 6838edc1a..8a33a6a39 100644
--- a/runtime/src/iree-amd-aie/driver/xrt/api.h
+++ b/runtime/src/iree-amd-aie/driver/xrt/api.h
@@ -48,7 +48,7 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_driver_create(
     iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);
 
 #ifdef __cplusplus
-}       // extern "C"
+}  // extern "C"
 #endif  // __cplusplus
 
 #endif  // IREE_EXPERIMENTAL_XRT_API_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.cc
index d66578fd6..86d164d59 100644
--- a/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.cc
+++ b/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.cc
@@ -41,7 +41,7 @@ iree_status_t iree_hal_xrt_semaphore_create(
                                   &semaphore->base);
     semaphore->host_allocator = host_allocator;
     iree_atomic_store(&semaphore->value, initial_value,
-                            iree_memory_order_release);
+                      iree_memory_order_release);
     *out_semaphore = &semaphore->base;
   }
 
@@ -67,8 +67,7 @@ static iree_status_t iree_hal_xrt_semaphore_query(
   iree_hal_xrt_semaphore_t* semaphore =
       iree_hal_xrt_semaphore_cast(base_semaphore);
   // TODO: Support semaphores completely.
-  *out_value =
-      iree_atomic_load(&semaphore->value, iree_memory_order_acquire);
+  *out_value = iree_atomic_load(&semaphore->value, iree_memory_order_acquire);
   return iree_ok_status();
 }
 
@@ -78,8 +77,7 @@ static iree_status_t iree_hal_xrt_semaphore_signal(
       iree_hal_xrt_semaphore_cast(base_semaphore);
   // TODO: Support semaphores completely. Return OK currently as everything is
   // synchronized for each submit to allow things to run.
-  iree_atomic_store(&semaphore->value, new_value,
-                          iree_memory_order_release);
+  iree_atomic_store(&semaphore->value, new_value, iree_memory_order_release);
   iree_hal_semaphore_poll(&semaphore->base);
   return iree_ok_status();
 }
diff --git a/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.h b/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.h
index 4d47ae3e8..c1f5d05b0 100644
--- a/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.h
+++ b/runtime/src/iree-amd-aie/driver/xrt/nop_semaphore.h
@@ -21,7 +21,7 @@ iree_status_t iree_hal_xrt_semaphore_create(
     iree_hal_semaphore_t** out_semaphore);
 
 #ifdef __cplusplus
-}       // extern "C"
+}  // extern "C"
 #endif  // __cplusplus
 
 #endif  // IREE_AMD_AIE_DRIVER_XRT_NOP_SEMAPHORE_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt/registration/driver_module.h b/runtime/src/iree-amd-aie/driver/xrt/registration/driver_module.h
index 812d3fa5b..fa24d79a3 100644
--- a/runtime/src/iree-amd-aie/driver/xrt/registration/driver_module.h
+++ b/runtime/src/iree-amd-aie/driver/xrt/registration/driver_module.h
@@ -18,7 +18,7 @@ IREE_API_EXPORT iree_status_t
 iree_hal_xrt_driver_module_register(iree_hal_driver_registry_t* registry);
 
 #ifdef __cplusplus
-}       // extern "C"
+}  // extern "C"
 #endif  // __cplusplus
 
 #endif  // IREE_EXPERIMENTAL_XRT_REGISTRATION_DRIVER_MODULE_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_buffer.h b/runtime/src/iree-amd-aie/driver/xrt/xrt_buffer.h
index e7f39bca9..cc16eb194 100644
--- a/runtime/src/iree-amd-aie/driver/xrt/xrt_buffer.h
+++ b/runtime/src/iree-amd-aie/driver/xrt/xrt_buffer.h
@@ -29,7 +29,7 @@ iree_status_t iree_hal_xrt_buffer_wrap(
 xrt::bo* iree_hal_xrt_buffer_handle(const iree_hal_buffer_t* buffer);
 
 #ifdef __cplusplus
-}       // extern "C"
+}  // extern "C"
 #endif  // __cplusplus
 
 #endif  // IREE_AMD_AIE_DRIVER_XRT_XRT_BUFFER_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc
index 566f80e7a..6d678b4ea 100644
--- a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc
+++ b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc
@@ -390,10 +390,10 @@ const iree_hal_device_vtable_t iree_hal_xrt_device_vtable = {
     iree_hal_xrt_device_query_semaphore_compatibility,
     /*.queue_alloca = */ iree_hal_xrt_device_queue_alloca,
     /*.queue_dealloca = */ iree_hal_xrt_device_queue_dealloca,
-    /*.queue_fill=*/ iree_hal_device_queue_emulated_fill,
-    /*.queue_update=*/ iree_hal_device_queue_emulated_update,
-    /*.queue_copy=*/ iree_hal_device_queue_emulated_copy,
-    /*.queue_read=*/ iree_hal_xrt_device_queue_read,
+    /*.queue_fill=*/iree_hal_device_queue_emulated_fill,
+    /*.queue_update=*/iree_hal_device_queue_emulated_update,
+    /*.queue_copy=*/iree_hal_device_queue_emulated_copy,
+    /*.queue_read=*/iree_hal_xrt_device_queue_read,
     /*.queue_write = */ iree_hal_xrt_device_queue_write,
     /*.queue_execute = */ iree_hal_xrt_device_queue_execute,
     /*.queue_flush = */ iree_hal_xrt_device_queue_flush,
diff --git a/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs b/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs
index 8d4e49c13..66c1503d5 100644
--- a/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs
+++ b/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs
@@ -32,10 +32,10 @@ table ExecutableDef {
   // A map of entry point ordinals to the indices of the containing XCLBINs (the following field).
   // This list has the same size as the entry_points list.
   // This list currently is just a range (0, number of entry points] but will change when we start doing
-  // kernel merging in the backend. 
+  // kernel merging in the backend.
   pdi_indices:[uint32];
 
-  
+
   // PDI strings of the entry points.
   pdis: [PdiDef];
 
diff --git a/runtime/src/iree-amd-aie/schemas/xrt_executable_def.fbs b/runtime/src/iree-amd-aie/schemas/xrt_executable_def.fbs
index 4c69368c6..a8194fa9c 100644
--- a/runtime/src/iree-amd-aie/schemas/xrt_executable_def.fbs
+++ b/runtime/src/iree-amd-aie/schemas/xrt_executable_def.fbs
@@ -31,28 +31,28 @@ table ExecutableDef {
   // A map of entry point ordinals to string names as used in XCLBIN(s)
   entry_points:[string];
 
-  // xclbin container format (also known as AXLF) is defined in file xclbin.h. 
+  // xclbin container format (also known as AXLF) is defined in file xclbin.h.
   // The file uses xclbin2 as the magic word. AXLF is sections based extensible container.
   // Different sections store different parts of compiled application like bitstreams for PL (FPGA fabric),
-  // ELF for AIE tiles and embedded processors like Microblaze. 
-  // It also contains well structured metadata to define memory topology, 
-  // IP layout of instantiated peripherals and compute kernels, 
+  // ELF for AIE tiles and embedded processors like Microblaze.
+  // It also contains well structured metadata to define memory topology,
+  // IP layout of instantiated peripherals and compute kernels,
   // clocking details and kernel connectivity for each compute kernel.
 
-  // The compiler generates unique xclbin file tagged with UUID for every application compiled. 
-  // Each xclbin also has another UUID which defines its compatbility to the Shell. 
-  // Vitis compiler, v++ generates this file as part of linking stage. 
-  // End-users load this file via XRT xclLoadXclbin() API. 
-  // XRT userspace and kernel space components consume different sections of xclbin by 
+  // The compiler generates unique xclbin file tagged with UUID for every application compiled.
+  // Each xclbin also has another UUID which defines its compatibility to the Shell.
+  // Vitis compiler, v++ generates this file as part of linking stage.
+  // End-users load this file via XRT xclLoadXclbin() API.
+  // XRT userspace and kernel space components consume different sections of xclbin by
   // programming the hardware and initializing key data structures in XRT userspace libraries and XRT kernel drivers.
 
   // A map of entry point ordinals to the indices of the containing XCLBINs (the following field).
   // This list has the same size as the entry_points list.
   // This list currently is just a range (0, number of entry points] but will change when we start doing
-  // kernel merging in the backend. 
+  // kernel merging in the backend.
   xclbin_indices:[uint32];
 
-  
+
   // XCLBIN strings of the entry points.
   xclbins: [XclbinDef];