From a1fd889a9fc12f3172ae19813655f8619f6c382b Mon Sep 17 00:00:00 2001
From: Jorn Tuyls <jorn.tuyls@gmail.com>
Date: Wed, 11 Dec 2024 02:23:18 -0800
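Subject: [PATCH] Refactor BufferizeOperand naming for clarity

Prefix the `InputOutput`, `Input` and `Output` enumerators of
`BufferizeOperand` with `Linalg`; `PackInput` is unchanged:

  InputOutput -> LinalgInputOutput
  Input       -> LinalgInput
  Output      -> LinalgOutput

The accepted values of the `bufferize-operand` option of
`iree-amdaie-bufferize-to-allocation` are renamed to match
(`linalg-input-output`, `linalg-input`, `linalg-output`), so any
pipeline string that passes the old spellings has to be updated. The
RUN lines and FileCheck prefixes of bufferize_to_allocation.mlir are
updated accordingly.
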
---
 .../AMDAIEBufferizeToAllocation.cpp           |   6 +-
 .../iree-amd-aie/Transforms/KernelDispatch.h  |   2 +-
 .../iree-amd-aie/Transforms/Passes.cpp        |  18 +-
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |   8 +-
 .../test/bufferize_to_allocation.mlir         | 168 +++++++++---------
 5 files changed, 101 insertions(+), 101 deletions(-)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
index 0931811d5..0ce3e3704 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
@@ -103,13 +103,13 @@ static FailureOr<SmallVector<Value>> getOperandsToBufferize(
     uint32_t packDepth) {
   switch (bufferizeOperand) {
     /// Create new allocations for Lhs, Rhs and Out.
-    case BufferizeOperand::InputOutput:
+    case BufferizeOperand::LinalgInputOutput:
       return getInputOutputOperands(linalgOp);
     /// Create new allocation only for Lhs, Rhs.
-    case BufferizeOperand::Input:
+    case BufferizeOperand::LinalgInput:
       return getInputOperands(linalgOp);
     /// Create new allocations only for Out.
-    case BufferizeOperand::Output:
+    case BufferizeOperand::LinalgOutput:
       return SmallVector<Value>(linalgOp.getDpsInits());
     /// Create new allocations for operands from the pack ops.
     case BufferizeOperand::PackInput:
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h
index 447d4ca90..e13e28dda 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.h
@@ -34,7 +34,7 @@ enum class TilePassPipeline {
 enum class PeelingType { First, Last, FirstLast };
 
 /// Enum for operands to be bufferized to allocation.
-enum class BufferizeOperand { InputOutput, Input, Output, PackInput };
+enum class BufferizeOperand { LinalgInputOutput, LinalgInput, LinalgOutput, PackInput };
 
 LogicalResult initAIELaunchConfig(FunctionOpInterface funcOp,
                                   TilePassPipeline useTilePipeline,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 9ece915fe..3ed93262e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -155,7 +155,7 @@ void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager,
   {
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 1;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::Output;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgOutput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
   }
@@ -165,7 +165,7 @@ void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager,
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 1;
     bufferizeOptions.bufferizeElementwise = true;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::Input;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgInput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
   }
@@ -241,7 +241,7 @@ void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager,
   {
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 2;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::Input;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgInput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
   }
@@ -280,7 +280,7 @@ void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager,
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 2;
     bufferizeOptions.bufferizeElementwise = true;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::Input;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgInput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
   }
@@ -325,7 +325,7 @@ void addPadPackBasedPassPipeline(OpPassManager &funcPassManager,
   {
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 1;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::InputOutput;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgInputOutput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
   }
@@ -362,7 +362,7 @@ void addPadPackBasedPassPipeline(OpPassManager &funcPassManager,
   {
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 2;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::Output;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgOutput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
   }
@@ -388,7 +388,7 @@ void addPadPackBasedPassPipeline(OpPassManager &funcPassManager,
   {
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 2;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::Input;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgInput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
   }
@@ -438,7 +438,7 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
   {
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 1;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::InputOutput;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgInputOutput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
   }
@@ -466,7 +466,7 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
   {
     AMDAIEBufferizeToAllocationOptions bufferizeOptions;
     bufferizeOptions.memorySpace = 2;
-    bufferizeOptions.bufferizeOperand = BufferizeOperand::InputOutput;
+    bufferizeOptions.bufferizeOperand = BufferizeOperand::LinalgInputOutput;
     funcPassManager.addPass(
         createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
     addCleanups();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 171326d82..f7ac2c8a1 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -90,14 +90,14 @@ def AMDAIEBufferizeToAllocation :
       "Indicator of whether the target op for bufferization is an elementwise op">,
     Option<"bufferizeOperand", "bufferize-operand",
       "mlir::iree_compiler::AMDAIE::BufferizeOperand",
-      /*default=*/"mlir::iree_compiler::AMDAIE::BufferizeOperand::InputOutput",
+      /*default=*/"mlir::iree_compiler::AMDAIE::BufferizeOperand::LinalgInputOutput",
       "Select which operands of a linalg op to be bufferized to allocation",
       [{::llvm::cl::values(
-        clEnumValN(mlir::iree_compiler::AMDAIE::BufferizeOperand::InputOutput, "input-output",
+        clEnumValN(mlir::iree_compiler::AMDAIE::BufferizeOperand::LinalgInputOutput, "linalg-input-output",
                    "Create new allocations for lhs, rhs and output of a linalg op."),
-        clEnumValN(mlir::iree_compiler::AMDAIE::BufferizeOperand::Input, "input",
+        clEnumValN(mlir::iree_compiler::AMDAIE::BufferizeOperand::LinalgInput, "linalg-input",
                    "Create new allocations for lhs, rhs of a linalg op."),
-        clEnumValN(mlir::iree_compiler::AMDAIE::BufferizeOperand::Output, "output",
+        clEnumValN(mlir::iree_compiler::AMDAIE::BufferizeOperand::LinalgOutput, "linalg-output",
                    "Create new allocations for output of a linalg op."),
         clEnumValN(mlir::iree_compiler::AMDAIE::BufferizeOperand::PackInput, "pack-input",
                    "Create new allocations for operands from the pack op inputs of a linalg op.")
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir
index a54bfda1e..98f6829ad 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir
@@ -1,9 +1,9 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-operand=input-output}))' --split-input-file %s | FileCheck %s --check-prefix=INPUT-OUTPUT
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-operand=input}))' --split-input-file %s | FileCheck %s --check-prefix=INPUT
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-operand=output}))' --split-input-file %s | FileCheck %s --check-prefix=OUTPUT
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=1 bufferize-operand=pack-input pack-depth=2}))' --split-input-file %s | FileCheck %s --check-prefix=PACK
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-elementwise=true bufferize-operand=input}))' --split-input-file %s | FileCheck %s --check-prefix=ELEMENTWISE-INPUT
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-elementwise=true bufferize-operand=input-output}))' --split-input-file %s | FileCheck %s --check-prefix=ELEMENTWISE-INPUT-OUTPUT
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-operand=linalg-input-output}))' --split-input-file %s | FileCheck %s --check-prefix=LINALG-INPUT-OUTPUT
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-operand=linalg-input}))' --split-input-file %s | FileCheck %s --check-prefix=LINALG-INPUT
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-operand=linalg-output}))' --split-input-file %s | FileCheck %s --check-prefix=LINALG-OUTPUT
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=1 bufferize-operand=pack-input pack-depth=2}))' --split-input-file %s | FileCheck %s --check-prefix=PACK-INPUT
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-elementwise=true bufferize-operand=linalg-input}))' --split-input-file %s | FileCheck %s --check-prefix=ELEMENTWISE-INPUT
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-bufferize-to-allocation{memory-space=2 bufferize-elementwise=true bufferize-operand=linalg-input-output}))' --split-input-file %s | FileCheck %s --check-prefix=ELEMENTWISE-INPUT-OUTPUT
 
 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d3, d5, d6, d8)>
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d5, d4, d7, d8)>
@@ -35,61 +35,61 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x
     return %unpack_3 : tensor<1024x512xi32>
 }
 
-// INPUT-OUTPUT-NOT:  memref.alloc
-//     INPUT-OUTPUT:  tensor.pack
-// INPUT-OUTPUT-NOT:  memref.alloc
-//     INPUT-OUTPUT:  tensor.pack
-//     INPUT-OUTPUT:  memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32>
-//     INPUT-OUTPUT:  bufferization.to_tensor
-//     INPUT-OUTPUT:  tensor.pack
-//     INPUT-OUTPUT:  memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32>
-//     INPUT-OUTPUT:  bufferization.to_tensor
-//     INPUT-OUTPUT:  tensor.pack
-//     INPUT-OUTPUT:  memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32>
-//     INPUT-OUTPUT:  bufferization.to_tensor
-//     INPUT-OUTPUT:  linalg.fill
-//     INPUT-OUTPUT:  linalg.generic
+// LINALG-INPUT-OUTPUT-NOT:  memref.alloc
+// LINALG-INPUT-OUTPUT:      tensor.pack
+// LINALG-INPUT-OUTPUT-NOT:  memref.alloc
+// LINALG-INPUT-OUTPUT:      tensor.pack
+// LINALG-INPUT-OUTPUT:      memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32>
+// LINALG-INPUT-OUTPUT:      bufferization.to_tensor
+// LINALG-INPUT-OUTPUT:      tensor.pack
+// LINALG-INPUT-OUTPUT:      memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32>
+// LINALG-INPUT-OUTPUT:      bufferization.to_tensor
+// LINALG-INPUT-OUTPUT:      tensor.pack
+// LINALG-INPUT-OUTPUT:      memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32>
+// LINALG-INPUT-OUTPUT:      bufferization.to_tensor
+// LINALG-INPUT-OUTPUT:      linalg.fill
+// LINALG-INPUT-OUTPUT:      linalg.generic
 
-// INPUT-NOT:  memref.alloc
-//     INPUT:  tensor.pack
-// INPUT-NOT:  memref.alloc
-//     INPUT:  tensor.pack
-//     INPUT:  memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32>
-//     INPUT:  bufferization.to_tensor
-//     INPUT:  tensor.pack
-//     INPUT:  memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32>
-//     INPUT:  bufferization.to_tensor
-//     INPUT:  tensor.pack
-// INPUT-NOT:  memref.alloc
-//     INPUT:  linalg.fill
-//     INPUT:  linalg.generic
+// LINALG-INPUT-NOT:  memref.alloc
+// LINALG-INPUT:      tensor.pack
+// LINALG-INPUT-NOT:  memref.alloc
+// LINALG-INPUT:      tensor.pack
+// LINALG-INPUT:      memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32>
+// LINALG-INPUT:      bufferization.to_tensor
+// LINALG-INPUT:      tensor.pack
+// LINALG-INPUT:      memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32>
+// LINALG-INPUT:      bufferization.to_tensor
+// LINALG-INPUT:      tensor.pack
+// LINALG-INPUT-NOT:  memref.alloc
+// LINALG-INPUT:      linalg.fill
+// LINALG-INPUT:      linalg.generic
 
-// OUTPUT-NOT:  memref.alloc
-//     OUTPUT:  tensor.pack
-// OUTPUT-NOT:  memref.alloc
-//     OUTPUT:  tensor.pack
-// OUTPUT-NOT:  memref.alloc
-//     OUTPUT:  tensor.pack
-// OUTPUT-NOT:  memref.alloc
-//     OUTPUT:  tensor.pack
-//     OUTPUT:  memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32>
-//     OUTPUT:  bufferization.to_tensor
-//     OUTPUT:  linalg.fill
-//     OUTPUT:  linalg.generic
+// LINALG-OUTPUT-NOT:  memref.alloc
+// LINALG-OUTPUT:      tensor.pack
+// LINALG-OUTPUT-NOT:  memref.alloc
+// LINALG-OUTPUT:      tensor.pack
+// LINALG-OUTPUT-NOT:  memref.alloc
+// LINALG-OUTPUT:      tensor.pack
+// LINALG-OUTPUT-NOT:  memref.alloc
+// LINALG-OUTPUT:      tensor.pack
+// LINALG-OUTPUT:      memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32>
+// LINALG-OUTPUT:      bufferization.to_tensor
+// LINALG-OUTPUT:      linalg.fill
+// LINALG-OUTPUT:      linalg.generic
 
-// PACK:      memref.alloc() : memref<16x32x64x64xi32, 1 : i32>
-// PACK:      bufferization.to_tensor
-// PACK:      tensor.pack
-// PACK:      memref.alloc() : memref<32x8x64x64xi32, 1 : i32>
-// PACK:      bufferization.to_tensor
-// PACK:      tensor.pack
-// PACK-NOT:  memref.alloc
-// PACK:      tensor.pack
-// PACK-NOT:  memref.alloc
-// PACK:      tensor.pack
-// PACK-NOT:  memref.alloc
-// PACK:      linalg.fill
-// PACK:      linalg.generic
+// PACK-INPUT:      memref.alloc() : memref<16x32x64x64xi32, 1 : i32>
+// PACK-INPUT:      bufferization.to_tensor
+// PACK-INPUT:      tensor.pack
+// PACK-INPUT:      memref.alloc() : memref<32x8x64x64xi32, 1 : i32>
+// PACK-INPUT:      bufferization.to_tensor
+// PACK-INPUT:      tensor.pack
+// PACK-INPUT-NOT:  memref.alloc
+// PACK-INPUT:      tensor.pack
+// PACK-INPUT-NOT:  memref.alloc
+// PACK-INPUT:      tensor.pack
+// PACK-INPUT-NOT:  memref.alloc
+// PACK-INPUT:      linalg.fill
+// PACK-INPUT:      linalg.generic
 
 // -----
 
@@ -143,31 +143,31 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024
   return %1 : tensor<1024x1024xi32>
 }
 
-// ELEMENTWISE-INPUT-COUNT-4:  tensor.pack
-//         ELEMENTWISE-INPUT:  linalg.fill
-//         ELEMENTWISE-INPUT:  linalg.generic
-//     ELEMENTWISE-INPUT-NOT:  memref.alloc
-//         ELEMENTWISE-INPUT:  tensor.pack
-//     ELEMENTWISE-INPUT-NOT:  memref.alloc
-//         ELEMENTWISE-INPUT:  tensor.pack
-//     ELEMENTWISE-INPUT-NOT:  memref.alloc
-//         ELEMENTWISE-INPUT:  tensor.pack
-//         ELEMENTWISE-INPUT:  memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
-//         ELEMENTWISE-INPUT:  bufferization.to_tensor
-//         ELEMENTWISE-INPUT:  tensor.pack
-//         ELEMENTWISE-INPUT:  linalg.generic
+// ELEMENTWISE-INPUT-COUNT-4: tensor.pack
+// ELEMENTWISE-INPUT:         linalg.fill
+// ELEMENTWISE-INPUT:         linalg.generic
+// ELEMENTWISE-INPUT-NOT:     memref.alloc
+// ELEMENTWISE-INPUT:         tensor.pack
+// ELEMENTWISE-INPUT-NOT:     memref.alloc
+// ELEMENTWISE-INPUT:         tensor.pack
+// ELEMENTWISE-INPUT-NOT:     memref.alloc
+// ELEMENTWISE-INPUT:         tensor.pack
+// ELEMENTWISE-INPUT:         memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
+// ELEMENTWISE-INPUT:         bufferization.to_tensor
+// ELEMENTWISE-INPUT:         tensor.pack
+// ELEMENTWISE-INPUT:         linalg.generic
 
 // ELEMENTWISE-INPUT-OUTPUT-COUNT-4:  tensor.pack
-//         ELEMENTWISE-INPUT-OUTPUT:  linalg.fill
-//         ELEMENTWISE-INPUT-OUTPUT:  linalg.generic
-//     ELEMENTWISE-INPUT-OUTPUT-NOT:  memref.alloc
-//         ELEMENTWISE-INPUT-OUTPUT:  tensor.pack
-//     ELEMENTWISE-INPUT-OUTPUT-NOT:  memref.alloc
-//         ELEMENTWISE-INPUT-OUTPUT:  tensor.pack
-//         ELEMENTWISE-INPUT-OUTPUT:  memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
-//         ELEMENTWISE-INPUT-OUTPUT:  bufferization.to_tensor
-//         ELEMENTWISE-INPUT-OUTPUT:  tensor.pack
-//         ELEMENTWISE-INPUT-OUTPUT:  memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
-//         ELEMENTWISE-INPUT-OUTPUT:  bufferization.to_tensor
-//         ELEMENTWISE-INPUT-OUTPUT:  tensor.pack
-//         ELEMENTWISE-INPUT-OUTPUT:  linalg.generic
+// ELEMENTWISE-INPUT-OUTPUT:          linalg.fill
+// ELEMENTWISE-INPUT-OUTPUT:          linalg.generic
+// ELEMENTWISE-INPUT-OUTPUT-NOT:      memref.alloc
+// ELEMENTWISE-INPUT-OUTPUT:          tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT-NOT:      memref.alloc
+// ELEMENTWISE-INPUT-OUTPUT:          tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT:          memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
+// ELEMENTWISE-INPUT-OUTPUT:          bufferization.to_tensor
+// ELEMENTWISE-INPUT-OUTPUT:          tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT:          memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
+// ELEMENTWISE-INPUT-OUTPUT:          bufferization.to_tensor
+// ELEMENTWISE-INPUT-OUTPUT:          tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT:          linalg.generic