Update mlir-air and mlir-aie version; add bf16 conv2d board test (nod-ai#503)
erwei-xilinx · Jul 5, 2024 · 485f670 · 485f670
1 parent 2210796
commit 485f670
Show file tree

Hide file tree

Showing 22 changed files with 98 additions and 89 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -103,7 +103,7 @@ jobs:
         run: |
           python3 -m venv .venv
           source .venv/bin/activate
-          pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024062422+7917990-py3-none-manylinux_2_35_x86_64.whl 
+          pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024070222+76460fe-py3-none-manylinux_2_35_x86_64.whl
 
           pip install -r tests/matmul/requirements.txt
 

diff --git a/build_tools/ci/cpu_comparison/run_test.sh b/build_tools/ci/cpu_comparison/run_test.sh
@@ -326,9 +326,6 @@ function run_test() {
 # Example of running a test directly from an .mlir file with a function.
 run_test --test_file ${THIS_DIR}/test_files/matmul_int32.mlir
 
-# Example of running a test directly from an .mlir file with a function.
-run_test --test_file ${THIS_DIR}/test_files/conv_int32.mlir --pipeline "conv-decompose"
-
 # An example of an arbitrary graph with three matmuls which form three dispatches.
 run_test --test_file ${THIS_DIR}/test_files/three_matmuls.mlir
 
@@ -358,4 +355,7 @@ generate_matmul_test \
    --m "1024"  --n "1024" --k "512"
 run_test --test_file ${test_name} --pipeline "pack-peel"
 
+# Conv2d tests.
+run_test --test_file ${THIS_DIR}/test_files/conv_int32.mlir --pipeline "conv-decompose"
+run_test --test_file ${THIS_DIR}/test_files/conv_bf16.mlir --pipeline "conv-decompose"
 
diff --git a/build_tools/ci/cpu_comparison/test_files/conv_bf16.mlir b/build_tools/ci/cpu_comparison/test_files/conv_bf16.mlir
@@ -0,0 +1,11 @@
+// These 2 lines are required by the script which generates input data:
+// input 2x14x14x32xbf16
+// input 3x3x32x64xbf16
+
+func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x32x64xbf16>) -> tensor<2x12x12x64xf32> {
+  %cst = arith.constant 0.0 : f32
+  %0 = tensor.empty() : tensor<2x12x12x64xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  %2 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x32xbf16>, tensor<3x3x32x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  return %2 : tensor<2x12x12x64xf32>
+}
diff --git a/build_tools/ci/print_ir_aie2xclbin/basic_dma_transpose.mlir b/build_tools/ci/print_ir_aie2xclbin/basic_dma_transpose.mlir
@@ -24,8 +24,8 @@ module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#
             aie.end
           }
           func.func @dummy2(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) {
-            aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<4096xi32>
-            aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 64, 64, 1][1, 1, 64]) {id = 1 : i64, metadata = @in} : memref<4096xi32>
+            aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<4096xi32>
+            aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 64, 64, 1][1, 1, 64, 1]) {id = 1 : i64, metadata = @in} : memref<4096xi32>
             aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
           }

diff --git a/build_tools/ci/print_ir_aie2xclbin/buffers_xclbin.mlir b/build_tools/ci/print_ir_aie2xclbin/buffers_xclbin.mlir
@@ -113,14 +113,14 @@ module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#
           aie.shim_dma_allocation @out2(S2MM, 2, 0)
 
           func.func @dummy2(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>, %arg3: memref<1024xi32>, %arg4: memref<1024xi32>, %arg5: memref<1024xi32>) {
-            aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 0 : i64, metadata = @in0} : memref<1024xi32>
-            aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @out0} : memref<1024xi32>
+            aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, metadata = @in0} : memref<1024xi32>
+            aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 1 : i64, metadata = @out0} : memref<1024xi32>
             aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-            aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @in1} : memref<1024xi32>
-            aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 3 : i64, metadata = @out1} : memref<1024xi32>
+            aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in1} : memref<1024xi32>
+            aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out1} : memref<1024xi32>
             aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-            aiex.npu.dma_memcpy_nd(0, 0, %arg4[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @in2} : memref<1024xi32>
-            aiex.npu.dma_memcpy_nd(0, 0, %arg5[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 3 : i64, metadata = @out2} : memref<1024xi32>
+            aiex.npu.dma_memcpy_nd(0, 0, %arg4[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in2} : memref<1024xi32>
+            aiex.npu.dma_memcpy_nd(0, 0, %arg5[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out2} : memref<1024xi32>
             aiex.npu.sync {channel = 0 : i32, column = 2 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
           }

diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp
@@ -312,7 +312,7 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
     auto issue_token = BoolAttr::get(ctx, false);
     auto repeat_count = zero;
 
-    llvm::SmallVector<int64_t, 3> strides = llvm::map_to_vector(
+    llvm::SmallVector<int64_t, 4> strides = llvm::map_to_vector(
         llvm::reverse(op.getMixedStrides()),
         [](OpFoldResult s) { return getConstantIntValue(s).value(); });
     llvm::SmallVector<int64_t, 4> sizes = llvm::map_to_vector(
@@ -359,25 +359,25 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
     buffer_offset = IntegerAttr::get(i32ty, offset);
 
     // d0_size
-    if (strides[0]) d0_size = IntegerAttr::get(i32ty, sizes[0]);
+    if (strides[1]) d0_size = IntegerAttr::get(i32ty, sizes[0]);
 
     // d0_stride
-    d0_stride = IntegerAttr::get(i32ty, 0);
+    if (strides[0]) d0_stride = IntegerAttr::get(i32ty, strides[0] - 1);
 
     // d1_size
-    if (strides[1]) d1_size = IntegerAttr::get(i32ty, sizes[1]);
+    if (strides[2]) d1_size = IntegerAttr::get(i32ty, sizes[1]);
 
     // d1_stride
-    if (strides[0]) d1_stride = IntegerAttr::get(i32ty, strides[0] - 1);
+    if (strides[1]) d1_stride = IntegerAttr::get(i32ty, strides[1] - 1);
 
     // d2_stride
-    if (strides[1]) d2_stride = IntegerAttr::get(i32ty, strides[1] - 1);
+    if (strides[2]) d2_stride = IntegerAttr::get(i32ty, strides[2] - 1);
 
     // iteration_size
-    if (strides[2]) iteration_size = IntegerAttr::get(i32ty, sizes[3] - 1);
+    if (strides[3]) iteration_size = IntegerAttr::get(i32ty, sizes[3] - 1);
 
     // iteration_stride
-    if (strides[2]) iteration_stride = IntegerAttr::get(i32ty, strides[2] - 1);
+    if (strides[3]) iteration_stride = IntegerAttr::get(i32ty, strides[3] - 1);
 
     // valid_bd
     valid_bd = IntegerAttr::get(i32ty, 1);

diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir
@@ -29,8 +29,8 @@ module {
       %c8 = arith.constant 8 : i64
       %c16 = arith.constant 16 : i64
       %c32 = arith.constant 32 : i64
-      aiex.npu.dma_memcpy_nd(0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0]) { metadata = @of_toMem, id = 1 : i64 } : memref<64xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0,%c1]) { metadata = @of_toMem, id = 1 : i64 } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32>
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)

diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aiex_standard_lowering.mlir
@@ -13,7 +13,7 @@ module  {
   aie.device(npu1_4col) {
     memref.global "public" @toMem : memref<16xi32>
     func.func @dma_and_wait(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
       aiex.npu.dma_wait {symbol = @toMem}
       return
     }

diff --git a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir
@@ -16,8 +16,8 @@ module  {
     memref.global "public" @toMem : memref<16xi32>
     memref.global "public" @fromMem : memref<16xi32>
     func.func @dma_memcpy_nd_0(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
-      aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
       return
     }
     aie.shim_dma_allocation @fromMem (MM2S, 0, 0)
@@ -39,7 +39,7 @@ module  {
   aie.device(npu1_4col) {
     memref.global "public" @toMem : memref<16xi32>
     func.func @dma_wait_s2mm(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
       aiex.npu.dma_wait {symbol = @toMem}
       return
     }
@@ -61,7 +61,7 @@ module  {
   aie.device(npu1_4col) {
     memref.global "public" @toMem : memref<16xi32>
     func.func @dma_wait_mm2s(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
       aiex.npu.dma_wait {symbol = @toMem}
       return
     }

diff --git a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu_issue_token.mlir b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu_issue_token.mlir
@@ -16,8 +16,8 @@ module  {
     memref.global "public" @toMem : memref<16xi32>
     memref.global "public" @fromMem : memref<16xi32>
     func.func @test1(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-        aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32>
-        aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
+        aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32>
+        aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
         return
     }
     aie.shim_dma_allocation @fromMem (MM2S, 0, 0)

diff --git a/...AIE/iree-amd-aie/Target/tests/cdo/matmul_16x16_8xi32__dispatch_0_matmul_16x1_0.aiecc.mlir b/...AIE/iree-amd-aie/Target/tests/cdo/matmul_16x16_8xi32__dispatch_0_matmul_16x1_0.aiecc.mlir
@@ -155,9 +155,9 @@ aie.device(npu1_4col) {
     memref.assume_alignment %arg0, 64 : memref<16x8xi32>
     memref.assume_alignment %arg1, 64 : memref<8x16xi32>
     memref.assume_alignment %arg2, 64 : memref<16x16xi32>
-    aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 8][0, 0, 8]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x8xi32>
-    aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 8, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<8x16xi32>
-    aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32>
+    aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 8][0, 0, 8, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x8xi32>
+    aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 8, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<8x16xi32>
+    aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32>
     aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
     return
   }

diff --git a/...AIE/iree-amd-aie/Target/tests/cdo/matmul_16x64_32xi8__dispatch_0_matmul_tran_0.aiecc.mlir b/...AIE/iree-amd-aie/Target/tests/cdo/matmul_16x64_32xi8__dispatch_0_matmul_tran_0.aiecc.mlir
@@ -155,9 +155,9 @@ aie.device(npu1_4col) {
     memref.assume_alignment %arg0, 64 : memref<256xi32>
     memref.assume_alignment %arg1, 64 : memref<512xi32>
     memref.assume_alignment %arg2, 64 : memref<16x32xi32>
-    aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<256xi32>
-    aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 32, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<512xi32>
-    aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 32][0, 0, 32]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x32xi32>
+    aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<256xi32>
+    aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 32, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<512xi32>
+    aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 32][0, 0, 32, 1]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x32xi32>
     aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
     return
   }