Commit ea217cd

gsitaram authored and jychang48 committed

Add mfma example for operating on INT8 values

1 parent 25a8e7b commit ea217cd

File tree

1 file changed: +182 -0 lines changed
@@ -0,0 +1,182 @@
/*
Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
#include <random>
#include "helper.hpp"

/*
This example code uses the mfma intrinsic __builtin_amdgcn_mfma_i32_16x16x16i8 to
compute a 16x16x16 matrix multiplication.

Input:
  A : 16 x 16 int8s (a 16x16 matrix)
  B : 16 x 16 int8s (a 16x16 matrix)

Output:
  D : 16 x 16 int32s (a 16x16 matrix)
*/
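
/*
Per output element, the instruction computes the widened dot product

    D(m, n) = sum_{k=0}^{15} int32(A(m, k)) * int32(B(k, n))

i.e. the int8 products are accumulated in int32, so a full 16x16x16 product
cannot overflow (16 * 128 * 128 = 262144 fits comfortably in 32 bits).
*/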

constexpr int M = 16;
constexpr int N = 16;
constexpr int K = 16;

constexpr int LDA = K;
constexpr int LDB = N;
constexpr int LDD = N;

constexpr int A_size = M * LDA;
constexpr int B_size = K * LDB;
constexpr int D_size = M * LDD;
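
// All matrices are stored row-major, so LDA/LDB/LDD are the row strides
// (distance in elements between the starts of consecutive rows).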

__global__ void igemm_16x16x16(const int8_t* A, const int8_t* B, int32_t* D)
{
#if __gfx90a__ || __gfx908__
    using int32x4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;
    // This kernel computes a 16x16x16 matrix multiplication using a single wavefront.
    int32x4 d = {0}; // zero out 4 vanilla VGPRs

    /*
    One invocation of v_mfma_i32_16x16x16i8 accumulates the sum of 16 outer products,
    16 columns of A with 16 rows of B, into the result matrix D (which lives in
    AccVGPRs), so a single iteration computes the full matrix D.

    We use a single VGPR each for the 16 columns of A and the 16 rows of B: with
    64 lanes and 4 int8 values per lane, one VGPR holds 256 int8 values, exactly
    one full 16x16 matrix.

    Matrix A is a 16 x 16 matrix that is stored in 1 VGPR as follows:
      a[0] covers columns 0, 4, 8, and 12
      a[1] covers columns 1, 5, 9, and 13
      a[2] covers columns 2, 6, 10, and 14
      a[3] covers columns 3, 7, 11, and 15
    first 16 lanes of a[0] cover column 0  -  last 16 lanes of a[0] cover column 12
    first 16 lanes of a[1] cover column 1  -  last 16 lanes of a[1] cover column 13
    first 16 lanes of a[2] cover column 2  -  last 16 lanes of a[2] cover column 14
    first 16 lanes of a[3] cover column 3  -  last 16 lanes of a[3] cover column 15

    Matrix B is a 16 x 16 matrix that is stored in 1 VGPR as follows:
      b[0] covers rows 0, 4, 8, and 12
      b[1] covers rows 1, 5, 9, and 13
      b[2] covers rows 2, 6, 10, and 14
      b[3] covers rows 3, 7, 11, and 15
    first 16 lanes of b[0] cover row 0  -  last 16 lanes of b[0] cover row 12
    first 16 lanes of b[1] cover row 1  -  last 16 lanes of b[1] cover row 13
    first 16 lanes of b[2] cover row 2  -  last 16 lanes of b[2] cover row 14
    first 16 lanes of b[3] cover row 3  -  last 16 lanes of b[3] cover row 15

    Note that A and B are in row-major order.

    This kernel is launched with a single wavefront in a dim3(16, 4) layout.
    */
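
    // Concretely, following the index arithmetic in the loads below, lane
    // (threadIdx.x, threadIdx.y) holds in register i:
    //   a[i] = A(threadIdx.x, 4*threadIdx.y + i)
    //   b[i] = B(4*threadIdx.y + i, threadIdx.x)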

    int8_t a[4];
    int8_t b[4];
    for(int i = 0; i < 4; ++i){
        const int a_idx = threadIdx.x * LDA      // consecutive threads cover 16 consecutive rows
                        + i                      // consecutive registers take consecutive columns
                        + threadIdx.y * 4;       // groups of 16 lanes skip 4 columns
        a[i] = A[a_idx];

        const int b_idx = threadIdx.x            // consecutive threads cover 16 consecutive columns
                        + i * LDB                // consecutive registers take consecutive rows
                        + threadIdx.y * LDB * 4; // groups of 16 lanes skip 4 rows
        b[i] = B[b_idx];
    }

    // d = A * B + C: the first argument packs 16 columns of A, the second packs
    // 16 rows of B, and the third is the accumulator C, which here is the same
    // set of registers as the destination D.
    d = __builtin_amdgcn_mfma_i32_16x16x16i8(*reinterpret_cast<int32_t*>(a), *reinterpret_cast<int32_t*>(b), d, 0, 0, 0);

    /*
    Matrix D is a 16 x 16 matrix that is stored in 4 AccVGPRs as follows:
      d[0] covers rows 0, 4, 8, and 12
      d[1] covers rows 1, 5, 9, and 13
      d[2] covers rows 2, 6, 10, and 14
      d[3] covers rows 3, 7, 11, and 15
    first 16 lanes of d[0] cover row 0  -  last 16 lanes of d[0] cover row 12
    first 16 lanes of d[1] cover row 1  -  last 16 lanes of d[1] cover row 13
    first 16 lanes of d[2] cover row 2  -  last 16 lanes of d[2] cover row 14
    first 16 lanes of d[3] cover row 3  -  last 16 lanes of d[3] cover row 15
    */
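    // Concretely: lane (threadIdx.x, threadIdx.y) holds in register i
    //   d[i] = D(4*threadIdx.y + i, threadIdx.x)
    // which the index arithmetic below scatters back to memory.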
    for(int i = 0; i < 4; ++i){
        const int d_idx = threadIdx.x            // consecutive threads cover 16 consecutive columns
                        + i * LDD                // consecutive registers take consecutive rows of 16 int32s
                        + threadIdx.y * 4 * LDD; // groups of 16 lanes skip 4 rows

        D[d_idx] = d[i];
    }
#endif
}

int main(){
    if (!gpuArchCheck("gfx90a") && !gpuArchCheck("gfx908")) {
        std::cout << "mfma_i32_16x16x16i8 instruction only available on gfx908 or gfx90a."
                  << std::endl;
        exit(-1);
    }

    std::mt19937 gen(0);
    // std::uniform_int_distribution is undefined for char-sized types such as
    // int8_t, so draw ints and narrow below.
    std::uniform_int_distribution<int> dist(-100, 100);

    // Make and populate some host matrices
    std::vector<int8_t> A_h(A_size);
    for(size_t i = 0; i < A_h.size(); ++i){
        A_h[i] = static_cast<int8_t>(dist(gen));
    }
    std::vector<int8_t> B_h(B_size);
    for(size_t i = 0; i < B_h.size(); ++i){
        B_h[i] = static_cast<int8_t>(dist(gen));
    }

    // Calculate reference D on host
    std::vector<int32_t> Dref_h(D_size);
    gemm_host(A_h, B_h, Dref_h, M, N, K, LDA, LDB, LDD);

    // Make and populate device buffers
    int8_t *A_d, *B_d;
    int32_t *D_d;
    HIP_CHECK(hipMalloc(&A_d, A_size * sizeof(int8_t)));
    HIP_CHECK(hipMalloc(&B_d, B_size * sizeof(int8_t)));
    HIP_CHECK(hipMalloc(&D_d, D_size * sizeof(int32_t)));
    HIP_CHECK(hipMemcpy(A_d, A_h.data(), A_size * sizeof(int8_t), hipMemcpyHostToDevice));
    HIP_CHECK(hipMemcpy(B_d, B_h.data(), B_size * sizeof(int8_t), hipMemcpyHostToDevice));

    // Launch GEMM kernel
    igemm_16x16x16<<<1, dim3(16, 4)>>>(A_d, B_d, D_d);
    HIP_CHECK(hipGetLastError());

    // Copy result back to host
    std::vector<int32_t> D_h(D_size);
    HIP_CHECK(hipMemcpy(D_h.data(), D_d, D_size * sizeof(int32_t), hipMemcpyDeviceToHost));

    std::cout << "Sum of squared differences of host/device result matrices: "
              << compute_l2_error(Dref_h, D_h, M, N, LDD, LDD)
              << std::endl;

    HIP_CHECK(hipFree(D_d));
    HIP_CHECK(hipFree(B_d));
    HIP_CHECK(hipFree(A_d));
    return 0;
}
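
Note on building (the source file name below is an assumption for illustration;
it is not shown in this diff): with ROCm installed and helper.hpp on the include
path, the example should compile and run with something like

    hipcc --offload-arch=gfx90a mfma_i32_16x16x16i8.cpp -o igemm_16x16x16
    ./igemm_16x16x16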
