@@ -53,6 +53,31 @@ func.func @vector_load_i4(%arg1: index, %arg2: index) -> vector<3x8xi4> {
// -----
+func.func @vector_load_f4(%arg1: index, %arg2: index) -> vector<3x8xf4E2M1FN> {
+  %0 = memref.alloc() : memref<3x8xf4E2M1FN>
+  %cst = arith.constant dense<0.0> : vector<3x8xf4E2M1FN>
+  %1 = vector.load %0[%arg1, %arg2] : memref<3x8xf4E2M1FN>, vector<8xf4E2M1FN>
+  %2 = vector.insert %1, %cst [0] : vector<8xf4E2M1FN> into vector<3x8xf4E2M1FN>
+  return %2 : vector<3x8xf4E2M1FN>
+}
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
+// CHECK: func @vector_load_f4
+// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
+// CHECK: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
+// CHECK: %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<12xi8>, vector<4xi8>
+// CHECK: %[[VEC_F4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xf4E2M1FN>
+
+// CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
+// CHECK32: func @vector_load_f4
+// CHECK32-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
+// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
+// CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
+// CHECK32: %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<3xi32>, vector<1xi32>
+// CHECK32: %[[VEC_F4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xf4E2M1FN>
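+
+// Illustrative note on the maps above: a row of 8 f4E2M1FN values packs two per
+// byte into 4 bytes, so element (s0, s1) lands at byte s0 * 4 + s1 floordiv 2 of
+// the i8 view; with an i32 container, all 8 values fill one word, giving
+// s0 + s1 floordiv 8.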
+
+// -----
+
func.func @vector_load_i4_dynamic(%arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> vector<8xi4> {
  %0 = memref.alloc(%arg0, %arg1) : memref<?x?xi4>
  %1 = vector.load %0[%arg2, %arg3] : memref<?x?xi4>, vector<8xi4>
@@ -119,6 +144,37 @@ func.func @vector_transfer_read_i4(%arg1: index, %arg2: index) -> vector<8xi4> {
// -----
+func.func @vector_transfer_read_f4(%arg1: index, %arg2: index) -> vector<8xf4E2M1FN> {
+  %c0 = arith.constant 0.0 : f4E2M1FN
+  %0 = memref.alloc() : memref<3x8xf4E2M1FN>
+  %1 = vector.transfer_read %0[%arg1, %arg2], %c0 {in_bounds = [true]} :
+    memref<3x8xf4E2M1FN>, vector<8xf4E2M1FN>
+  return %1 : vector<8xf4E2M1FN>
+}
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
+// CHECK: func @vector_transfer_read_f4
+// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
+// CHECK: %[[CONST:.+]] = arith.constant 0.{{0+}}e+00 : f4E2M1FN
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
+// CHECK: %[[BC:.+]] = arith.bitcast %[[CONST]] : f4E2M1FN to i4
+// CHECK: %[[PAD:.+]] = arith.extui %[[BC]] : i4 to i8
+// CHECK: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
+// CHECK: %[[VEC:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %[[PAD]] : memref<12xi8>, vector<4xi8>
+// CHECK: %[[VEC_F4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xf4E2M1FN>
+
+// CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
+// CHECK32: func @vector_transfer_read_f4
+// CHECK32-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
+// CHECK32: %[[CONST:.+]] = arith.constant 0.{{0+}}e+00 : f4E2M1FN
+// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
+// CHECK32: %[[BC:.+]] = arith.bitcast %[[CONST]] : f4E2M1FN to i4
+// CHECK32: %[[PAD:.+]] = arith.extui %[[BC]] : i4 to i32
+// CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
+// CHECK32: %[[VEC:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %[[PAD]] : memref<3xi32>, vector<1xi32>
+// CHECK32: %[[VEC_F4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xf4E2M1FN>
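+
+// Note: the f4E2M1FN padding value cannot be used directly on the emulated i8/i32
+// memref, so it is first bitcast to i4 and then zero-extended to the container
+// width, as the BC and PAD lines above check.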
+
+// -----
+
///----------------------------------------------------------------------------------------
/// vector.maskedload
///----------------------------------------------------------------------------------------
@@ -439,6 +495,28 @@ func.func @vector_store_i4(%arg0: vector<8xi4>, %arg1: index, %arg2: index) {
// -----
+func.func @vector_store_f4(%arg0: vector<8xf4E2M1FN>, %arg1: index, %arg2: index) {
+  %0 = memref.alloc() : memref<4x8xf4E2M1FN>
+  vector.store %arg0, %0[%arg1, %arg2] : memref<4x8xf4E2M1FN>, vector<8xf4E2M1FN>
+  return
+}
+
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
+// CHECK: func @vector_store_f4
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<16xi8>
+// CHECK: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
+// CHECK: %[[VEC_I8:.+]] = vector.bitcast %[[ARG0]] : vector<8xf4E2M1FN> to vector<4xi8>
+// CHECK: vector.store %[[VEC_I8]], %[[ALLOC]][%[[INDEX]]] : memref<16xi8>, vector<4xi8>
+
+// CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
+// CHECK32: func @vector_store_f4
+// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<4xi32>
+// CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
+// CHECK32: %[[VEC_I32:.+]] = vector.bitcast %[[ARG0]] : vector<8xf4E2M1FN> to vector<1xi32>
+// CHECK32: vector.store %[[VEC_I32]], %[[ALLOC]][%[[INDEX]]] : memref<4xi32>, vector<1xi32>
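+
+// Note: 4 x 8 f4E2M1FN values occupy 16 bytes, hence the memref<16xi8> and
+// memref<4xi32> containers; the stored vector<8xf4E2M1FN> bitcasts losslessly
+// to vector<4xi8> or vector<1xi32> before the emulated store.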
+
+// -----
+
// FIXME: This example assumes that the store happens at a byte boundary, but
// that's not guaranteed. Below is a counter-example with specific dimensions:
// vector.store %arg0, %0[0, 3] : memref<2x13xi4>, vector<8xi4>
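// (Here the linearized offset is 0 * 13 + 3 = 3 i4 elements, i.e. 12 bits, so the
// store would begin halfway through byte 1 of the emulated byte memref.)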
0 commit comments