nod-ai · newling · Feb 20, 2025 · Feb 19, 2025 · Feb 19, 2025 · Feb 20, 2025
@@ -32,15 +32,14 @@ namespace {
 
 /// Utility which returns 'true' if the operation needs to be inserted within
 /// an `amdaie.core` op.
-/// Some ops are surrrounded by scf.for loop nests. Place the entire
+/// Some ops are surrounded by scf.for loop nests. Place the entire
 /// loop nest inside the amdaie.core op here. Currently look for a
 /// subset of ops which we know should be in the core.
 /// TODO(newling) improve this design.
 static bool isCoreComputeOp(Operation *op) {
   return isa<linalg::LinalgOp, vector::ContractionOp,
-             memref::ExtractStridedMetadataOp, func::CallOp, arith::ExtFOp,
-             arith::TruncFOp, arith::TruncIOp, vector::TransferReadOp,
-             vector::TransferWriteOp>(op);
+             memref::ExtractStridedMetadataOp, func::CallOp,
+             vector::TransferReadOp, vector::TransferWriteOp>(op);
 }
 
 /// Utility to map the parallel mapping attributes to the corresponding

@@ -300,53 +300,3 @@ module {
     return
   }
 }
-
-// -----
-
-// CHECK-LABEL: @insert_truncf_within_core
-// CHECK:           scf.forall
-// CHECK:             amdaie.tile
-// CHECK:             amdaie.core
-// CHECK:               vector.transfer_read
-// CHECK:               arith.truncf
-// CHECK:               vector.transfer_write
-// CHECK:               amdaie.end
-module {
-  func.func @insert_truncf_within_core(%arg0: memref<10x10xf32, 2 : i32>, %arg1: memref<10x10xbf16, 2 : i32>) {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c1 = arith.constant 1 : index
-    %c3 = arith.constant 3 : index
-    %c0 = arith.constant 0 : index
-    scf.forall (%arg3, %arg4) in (2, 2) {
-      %read = vector.transfer_read %arg0[%c0, %c1], %cst {in_bounds = [true, true]} : memref<10x10xf32, 2 : i32>, vector<1x1xf32>
-      %truncf = arith.truncf %read : vector<1x1xf32> to vector<1x1xbf16>
-      vector.transfer_write %truncf, %arg1[%c0, %c1] {in_bounds = [true, true]} : vector<1x1xbf16>, memref<10x10xbf16, 2 : i32>
-    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
-    return
-  }
-}
-
-// -----
-
-// CHECK-LABEL: @insert_trunci_within_core
-// CHECK:           scf.forall
-// CHECK:             amdaie.tile
-// CHECK:             amdaie.core
-// CHECK:               vector.transfer_read
-// CHECK:               arith.trunci
-// CHECK:               vector.transfer_write
-// CHECK:               amdaie.end
-module {
-  func.func @insert_trunci_within_core(%arg0: memref<10x10xi32, 2 : i32>, %arg1: memref<10x10xi8, 2 : i32>) {
-    %cst = arith.constant 0 : i32
-    %c1 = arith.constant 1 : index
-    %c3 = arith.constant 3 : index
-    %c0 = arith.constant 0 : index
-    scf.forall (%arg3, %arg4) in (2, 2) {
-      %read = vector.transfer_read %arg0[%c0, %c1], %cst {in_bounds = [true, true]} : memref<10x10xi32, 2 : i32>, vector<1x1xi32>
-      %trunci = arith.trunci %read : vector<1x1xi32> to vector<1x1xi8>
-      vector.transfer_write %trunci, %arg1[%c0, %c1] {in_bounds = [true, true]} : vector<1x1xi8>, memref<10x10xi8, 2 : i32>
-    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
-    return
-  }
-}