Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up consumer fusion lit tests #1004

Merged
merged 1 commit into from
Jan 2, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,38 @@ func.func @no_consumer_fusion(%arg0: tensor<64xf32>) -> tensor<64xf32> {

// -----

// CHECK-DAG: #[[UNPACK_RESULT_MAP0:.*]] = affine_map<(d0) -> (d0 * 4)>
// CHECK-DAG: #[[UNPACK_RESULT_MAP1:.*]] = affine_map<(d0) -> (d0 * 8)>
// CHECK-DAG: #[[EXTRACT_SLICE_MAP0:.*]] = affine_map<(d0) -> (32, d0 * -4 + 64)>
// CHECK-DAG: #[[EXTRACT_SLICE_MAP1:.*]] = affine_map<(d0) -> (32, d0 * -8 + 64)>
// CHECK: %[[FINAL:.*]] = scf.forall
// CHECK-SAME: shared_outs(%[[ITER_ARG_FINAL:.*]] = %{{.*}})
// CHECK: {
// CHECK: %[[FIRST_LOOP:.*]] = scf.forall
// CHECK: {
// CHECK: }
// CHECK: %[[SECOND_UNPACK_OUT:.*]] = tensor.empty() : tensor<64x64xi32>
// CHECK: %[[UNPACK_OUT:.*]] = tensor.empty() : tensor<1x1x64x64xi32>
// CHECK: %[[SECOND_LOOP:.*]]:2 = scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) shared_outs(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]])
// CHECK: {
// CHECK: %[[MATMUL:.*]] = linalg.generic
// CHECK-DAG: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK-DAG: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
// CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]])
// CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV1]])
// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1]
// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]]
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1]
// CHECK: }
// CHECK: }
// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] :
// CHECK: scf.forall.in_parallel
// CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]]
// CHECK: }
// CHECK: }
// CHECK: return %[[FINAL]]
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
Expand Down Expand Up @@ -249,41 +281,51 @@ module {
return %0 : tensor<1024x1024xi32>
}
}
// CHECK-DAG: #[[UNPACK_RESULT_MAP0:.*]] = affine_map<(d0) -> (d0 * 4)>
// CHECK-DAG: #[[UNPACK_RESULT_MAP1:.*]] = affine_map<(d0) -> (d0 * 8)>
// CHECK-DAG: #[[EXTRACT_SLICE_MAP0:.*]] = affine_map<(d0) -> (32, d0 * -4 + 64)>
// CHECK-DAG: #[[EXTRACT_SLICE_MAP1:.*]] = affine_map<(d0) -> (32, d0 * -8 + 64)>
// CHECK: %[[FINAL:.*]] = scf.forall
// CHECK-SAME: shared_outs(%[[ITER_ARG_FINAL:.*]] = %{{.*}})
// CHECK: {
// CHECK: %[[FIRST_LOOP:.*]] = scf.forall
// CHECK: {
// CHECK: }
// CHECK: %[[SECOND_UNPACK_OUT:.*]] = tensor.empty() : tensor<64x64xi32>
// CHECK: %[[UNPACK_OUT:.*]] = tensor.empty() : tensor<1x1x64x64xi32>
// CHECK: %[[SECOND_LOOP:.*]]:2 = scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) shared_outs(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]])
// CHECK: {
// CHECK: %[[MATMUL:.*]] = linalg.generic
// CHECK-DAG: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK-DAG: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
// CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]])
// CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV1]])
// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1]
// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]]
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1]
// CHECK: }
// CHECK: }
// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] :
// CHECK: scf.forall.in_parallel
// CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]]
// CHECK: }
// CHECK: }
// CHECK: return %[[FINAL]]

// -----

// CHECK-DAG: #[[UNPACK_RESULT_MAP0:.*]] = affine_map<(d0) -> (d0 * 4)>
// CHECK-DAG: #[[UNPACK_RESULT_MAP1:.*]] = affine_map<(d0) -> (d0 * 8)>
// CHECK-DAG: #[[EXTRACT_SLICE_MAP0:.*]] = affine_map<(d0) -> (32, d0 * -4 + 64)>
// CHECK-DAG: #[[EXTRACT_SLICE_MAP1:.*]] = affine_map<(d0) -> (32, d0 * -8 + 64)>
// CHECK: %[[FINAL:.*]] = scf.forall
// CHECK-SAME: shared_outs(%[[ITER_ARG_FINAL:.*]] = %{{.*}})
// CHECK: {
// CHECK: %[[FIRST_LOOP:.*]] = scf.forall
// CHECK: {
// CHECK: }
// CHECK: %[[ELEM_OUT:.*]] = tensor.empty() : tensor<1x1x8x16x4x8xi32>
// CHECK: %[[SECOND_UNPACK_OUT:.*]] = tensor.empty() : tensor<64x64xi32>
// CHECK: %[[UNPACK_OUT:.*]] = tensor.empty() : tensor<1x1x64x64xi32>
// CHECK: %[[SECOND_LOOP:.*]]:3 = scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) shared_outs(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_2:.*]] = %[[ELEM_OUT]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]])
// CHECK-SAME: {
// CHECK: %[[MATMUL:.*]] = linalg.generic
// CHECK: %[[OPERAND2:.*]] = tensor.extract_slice %[[ELEM_OUT]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: %[[OPERAND3:.*]] = tensor.extract_slice %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: %[[FUSED_CONSUMER:.*]] = linalg.generic
// CHECK-SAME: ins(%[[MATMUL]], %[[OPERAND2]] :
// CHECK-SAME: outs(%[[OPERAND3]] :
// CHECK: {
// CHECK: arith.addi
// CHECK: }
// CHECK-DAG: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK-DAG: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
// CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]])
// CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV1]])
// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1]
// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]]
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1]
// CHECK: }
// CHECK: }
// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] :
// CHECK: scf.forall.in_parallel
// CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]]
// CHECK: }
// CHECK: }
// CHECK: return %[[FINAL]]
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
Expand Down Expand Up @@ -337,48 +379,6 @@ module {
return %0 : tensor<1024x1024xi32>
}
}
// CHECK-DAG: #[[UNPACK_RESULT_MAP0:.*]] = affine_map<(d0) -> (d0 * 4)>
// CHECK-DAG: #[[UNPACK_RESULT_MAP1:.*]] = affine_map<(d0) -> (d0 * 8)>
// CHECK-DAG: #[[EXTRACT_SLICE_MAP0:.*]] = affine_map<(d0) -> (32, d0 * -4 + 64)>
// CHECK-DAG: #[[EXTRACT_SLICE_MAP1:.*]] = affine_map<(d0) -> (32, d0 * -8 + 64)>
// CHECK: %[[FINAL:.*]] = scf.forall
// CHECK-SAME: shared_outs(%[[ITER_ARG_FINAL:.*]] = %{{.*}})
// CHECK: {
// CHECK: %[[FIRST_LOOP:.*]] = scf.forall
// CHECK: {
// CHECK: }
// CHECK: %[[ELEM_OUT:.*]] = tensor.empty() : tensor<1x1x8x16x4x8xi32>
// CHECK: %[[SECOND_UNPACK_OUT:.*]] = tensor.empty() : tensor<64x64xi32>
// CHECK: %[[UNPACK_OUT:.*]] = tensor.empty() : tensor<1x1x64x64xi32>
// CHECK: %[[SECOND_LOOP:.*]]:3 = scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) shared_outs(%[[ITER_ARG_1:.*]] = %[[FIRST_LOOP]], %[[ITER_ARG_2:.*]] = %[[ELEM_OUT]], %[[ITER_ARG_3:.*]] = %[[UNPACK_OUT]])
// CHECK: {
// CHECK: %[[MATMUL:.*]] = linalg.generic
// CHECK: %[[OPERAND2:.*]] = tensor.extract_slice %[[ELEM_OUT]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: %[[OPERAND3:.*]] = tensor.extract_slice %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: %[[FUSED_CONSUMER:.*]] = linalg.generic
// CHECK-SAME: ins(%[[MATMUL]], %[[OPERAND2]] :
// CHECK-SAME: outs(%[[OPERAND3]] :
// CHECK: {
// CHECK: arith.addi
// CHECK: }
// CHECK-DAG: %[[iv0:.*]] = affine.apply #[[UNPACK_RESULT_MAP0]](%[[IV0]])
// CHECK-DAG: %[[iv1:.*]] = affine.apply #[[UNPACK_RESULT_MAP1]](%[[IV1]])
// CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]])
// CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV1]])
// CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1]
// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]]
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1]
// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1]
// CHECK: }
// CHECK: }
// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] :
// CHECK: scf.forall.in_parallel
// CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]]
// CHECK: }
// CHECK: }
// CHECK: return %[[FINAL]]

// -----

Expand Down Expand Up @@ -478,26 +478,20 @@ module {
// CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#1 into %[[UNPACK_OUT]][%[[ARG0]], %[[ARG1]], 0, 0] [4, 4, 32, 32] [1, 1, 1, 1]
// CHECK: }
// CHECK: tensor.unpack %[[FORALL_0]]#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]]
#config = #iree_codegen.lowering_config<tile_sizes = [[256, 256], [4, 4, 0], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d4, d5, d8, d7)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
#packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 64], transposePackIndices = [0, 1], unpackEmpty = [false, false], innerPerm = [[0, 1], [1, 0]], outerPerm = [[0, 1], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
// expected-error @+1 {{Maximum number of iterations reached, consumer fusion is likely stuck in an infinite loop.}}
func.func @matmul_multiple_fusion_iterations() {
func.func @matmul_multiple_fusion_iterations() -> tensor<512x4096xf32> {
%c0 = arith.constant 0 : index
%alloc = memref.alloc() : memref<1x1x8x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x8x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<8x8x32x32xf32, 1 : i32>
%alloc_2 = memref.alloc() : memref<8x8x64x32xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<8x8x32x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<512x512xbf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<512x4096xbf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<512x4096xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x512xbf16>> -> tensor<512x512xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x4096xbf16>> -> tensor<512x4096xbf16>
%3 = tensor.empty() : tensor<512x512xbf16>
%4 = tensor.empty() : tensor<512x4096xbf16>
%5 = tensor.empty() : tensor<512x4096xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (512, 4096) step (256, 256) shared_outs(%arg2 = %5) -> (tensor<512x4096xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [256, 512] [1, 1] : tensor<512x512xbf16> to tensor<256x512xbf16>
Expand All @@ -523,7 +517,7 @@ module {
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x8x8x4xbf16, 2 : i32> to tensor<1x1x8x8x8x4xbf16>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16>
%extracted_slice_16 = tensor.extract_slice %arg8[%arg6, %arg7, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<4x4x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32>
%16 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x8x4x8xbf16>, tensor<1x1x8x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #config, packing_config = #packingConfig} {
%16 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x8x4x8xbf16>, tensor<1x1x8x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) {
^bb0(%in: bf16, %in_17: bf16, %out: f32):
%17 = arith.extf %in : bf16 to f32
%18 = arith.extf %in_17 : bf16 to f32
Expand All @@ -545,12 +539,11 @@ module {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<256x256xf32> into tensor<512x4096xf32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [512, 4096], strides = [1, 1] : tensor<512x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x4096xf32>>
memref.dealloc %alloc_3 : memref<8x8x32x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<8x8x64x32xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<8x8x32x32xf32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x8x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x8x8x4xbf16, 2 : i32>
return
return %6 : tensor<512x4096xf32>
}
}
Loading