|
| 1 | +/* |
| 2 | +Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. |
| 3 | +Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4 | +of this software and associated documentation files (the "Software"), to deal |
| 5 | +in the Software without restriction, including without limitation the rights |
| 6 | +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 7 | +copies of the Software, and to permit persons to whom the Software is |
| 8 | +furnished to do so, subject to the following conditions: |
| 9 | +
|
| 10 | +The above copyright notice and this permission notice shall be included in |
| 11 | +all copies or substantial portions of the Software. |
| 12 | +
|
| 13 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 14 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 15 | +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 16 | +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 17 | +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 18 | +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 19 | +THE SOFTWARE. |
| 20 | +*/ |
| 21 | + |
| 22 | +#include <hip/hip_runtime.h> |
| 23 | +#include <iostream> |
| 24 | +#include <vector> |
| 25 | +#include <random> |
| 26 | +#include "helper.hpp" |
| 27 | + |
| 28 | +/* |
| 29 | +This example code uses the mfma intrinsic __builtin_amdgcn_mfma_i32_16x16x16i8 to |
| 30 | +compute a 16x16x16 matrix multiplication. |
| 31 | +
|
| 32 | +Input: |
| 33 | + A : 16 x 16 int8s (a 16x16 matrix) |
| 34 | + B : 16 x 16 int8s (a 16x16 matrix) |
| 35 | +
|
| 36 | +Output: |
| 37 | + D : 16 x 16 int32s (a 16x16 matrix) |
| 38 | +*/ |
| 39 | + |
// Problem size: D (M x N) = A (M x K) * B (K x N); all three are 16x16 here,
// matching a single v_mfma_i32_16x16x16i8 invocation.
constexpr int M = 16;
constexpr int N = 16;
constexpr int K = 16;

// Leading dimensions (row-major storage): the stride, in elements, between
// the start of one row and the start of the next.
constexpr int LDA = K;
constexpr int LDB = N;
constexpr int LDD = N;

// Total element counts of each row-major matrix buffer.
constexpr int A_size = M * LDA;
constexpr int B_size = K * LDB;
constexpr int D_size = M * LDD;
| 51 | + |
| 52 | + |
/// Computes D = A * B for 16x16 int8 matrices A and B (row-major), writing the
/// 16x16 int32 result D, using a single wavefront and one MFMA instruction.
///
/// @param A  16x16 int8 input matrix, row-major with leading dimension LDA.
/// @param B  16x16 int8 input matrix, row-major with leading dimension LDB.
/// @param D  16x16 int32 output matrix, row-major with leading dimension LDD.
///
/// Must be launched with exactly one wavefront laid out as dim3(16, 4).
/// On architectures without the intrinsic the kernel compiles to a no-op
/// (the #if guard below), so the host-side arch check is the real gate.
__global__ void igemm_16x16x16(const int8_t* A, const int8_t* B, int32_t* D)
{

#if __gfx90a__ || __gfx908__
    // 4 x int32 vector type: the MFMA accumulator occupies 4 AccVGPRs per lane.
    using int32x4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;
    // This kernel computes a 16x16x16 matrix multiplication using a single wavefront.
    int32x4 d = {0}; // zero out 4 vanilla VGPRs

    /*
    One invocation of v_mfma_i32_16x16x16i8 accumulates the sum of 16 outer products,
    16 columns of A with 16 rows of B, into result matrix D (which is in AccVGPRs).
    So we need only one iteration to compute the full matrix D.

    For both the 16 columns of A, and the 16 rows of B, we use a single VGPR
    (4 packed int8 values per lane). With 64 lanes, and 4 int8 values per lane,
    that covers the 16 columns of A and 16 rows of B.
    Matrix A is a 16 x 16 matrix that is stored in 1 VGPR as follows:
      a[0] covers columns 0, 4, 8, and 12
      a[1] covers columns 1, 5, 9, and 13
      a[2] covers columns 2, 6, 10, and 14
      a[3] covers columns 3, 7, 11, and 15
      first 16 lanes of a[0] cover column 0 -  last 16 lanes of a[0] cover column 12
      first 16 lanes of a[1] cover column 1 -  last 16 lanes of a[1] cover column 13
      first 16 lanes of a[2] cover column 2 -  last 16 lanes of a[2] cover column 14
      first 16 lanes of a[3] cover column 3 -  last 16 lanes of a[3] cover column 15
    Matrix B is a 16 x 16 matrix that is stored in 1 VGPR as follows:
      b[0] covers rows 0, 4, 8, and 12
      b[1] covers rows 1, 5, 9, and 13
      b[2] covers rows 2, 6, 10, and 14
      b[3] covers rows 3, 7, 11, and 15
      first 16 lanes of b[0] cover row 0 -  last 16 lanes of b[0] cover row 12
      first 16 lanes of b[1] cover row 1 -  last 16 lanes of b[1] cover row 13
      first 16 lanes of b[2] cover row 2 -  last 16 lanes of b[2] cover row 14
      first 16 lanes of b[3] cover row 3 -  last 16 lanes of b[3] cover row 15
    Note that A and B are in row-major order.

    This kernel is called with a single wavefront in dim3(16, 4) layout.
    */

    int8_t a[4];
    int8_t b[4];
    for(int i = 0; i < 4; ++i){
        const int a_idx =  threadIdx.x * LDA   // consecutive threads cover 16 consecutive rows
                         + i                   // consecutive registers take consecutive columns
                         + threadIdx.y * 4;    // groups of 16 lanes skip 4 columns
        a[i] = A[a_idx];

        const int b_idx =  threadIdx.x           // consecutive threads cover 16 consecutive columns
                         + i * LDB               // consecutive registers take consecutive rows
                         + threadIdx.y * LDB * 4; // groups of 16 lanes skip 4 rows
        b[i] = B[b_idx];
    }

    // The intrinsic takes the 4 packed int8 values as a single int32 per
    // operand; reinterpret the byte arrays accordingly. The trailing three 0s
    // are the cbsz/abid/blgp modifiers (no broadcast/swizzle).
    d = __builtin_amdgcn_mfma_i32_16x16x16i8(*reinterpret_cast<int32_t*>(a), *reinterpret_cast<int32_t*>(b), d, 0, 0, 0);
    //                                       ^                              ^                                ^
    //                   D(=C)              |                              |                                C(=D)
    //                                16 columns of A---|                  |--- 16 rows of B

    /*
    Matrix D is a 16 x 16 matrix that is stored in 4 AccVGPRs as follows:
      d[0] covers rows 0, 4, 8, and 12
      d[1] covers rows 1, 5, 9, and 13
      d[2] covers rows 2, 6, 10, and 14
      d[3] covers rows 3, 7, 11, and 15
      first 16 lanes of d[0] cover row 0 -  last 16 lanes of d[0] cover row 12
      first 16 lanes of d[1] cover row 1 -  last 16 lanes of d[1] cover row 13
      first 16 lanes of d[2] cover row 2 -  last 16 lanes of d[2] cover row 14
      first 16 lanes of d[3] cover row 3 -  last 16 lanes of d[3] cover row 15
    */
    for(int i = 0; i < 4; ++i){
        const int d_idx =  threadIdx.x            // consecutive threads cover 16 consecutive columns
                         + i * LDD                // consecutive registers take consecutive rows of 16 int32s
                         + threadIdx.y * 4 * LDD; // groups of 16 lanes skip 4 rows

        D[d_idx] = d[i];
    }
#endif
}
| 131 | + |
| 132 | + |
| 133 | +int main(){ |
| 134 | + if (!gpuArchCheck("gfx90a") && !gpuArchCheck("gfx908")) { |
| 135 | + std::cout << "mfma_f32_16x16x16f16 instruction only available on gfx908 or later." |
| 136 | + << std::endl; |
| 137 | + exit(-1); |
| 138 | + } |
| 139 | + |
| 140 | + std::mt19937 gen(0); |
| 141 | + std::uniform_int_distribution<int8_t> dist(-100, 100); |
| 142 | + |
| 143 | + // Make and populate some host matrices |
| 144 | + std::vector<int8_t> A_h(A_size); |
| 145 | + for(int i = 0; i < A_h.size(); ++i){ |
| 146 | + A_h[i] = static_cast<int8_t>(dist(gen)); |
| 147 | + } |
| 148 | + std::vector<int8_t> B_h(B_size); |
| 149 | + for(int i = 0; i < B_h.size(); ++i){ |
| 150 | + B_h[i] = static_cast<int8_t>(dist(gen)); |
| 151 | + } |
| 152 | + |
| 153 | + // Calculate reference D on host |
| 154 | + std::vector<int32_t> Dref_h(D_size); |
| 155 | + gemm_host(A_h, B_h, Dref_h, M, N, K, LDA, LDB, LDD); |
| 156 | + |
| 157 | + // Make and populate device buffers |
| 158 | + int8_t *A_d, *B_d; |
| 159 | + int32_t *D_d; |
| 160 | + HIP_CHECK(hipMalloc(&A_d, A_size * sizeof(int8_t))); |
| 161 | + HIP_CHECK(hipMalloc(&B_d, B_size * sizeof(int8_t))); |
| 162 | + HIP_CHECK(hipMalloc(&D_d, D_size * sizeof(int32_t))); |
| 163 | + HIP_CHECK(hipMemcpy(A_d, A_h.data(), A_size * sizeof(int8_t), hipMemcpyHostToDevice)); |
| 164 | + HIP_CHECK(hipMemcpy(B_d, B_h.data(), B_size * sizeof(int8_t), hipMemcpyHostToDevice)); |
| 165 | + |
| 166 | + // Launch GEMM kernel |
| 167 | + igemm_16x16x16<<<1, dim3(16, 4)>>>(A_d, B_d, D_d); |
| 168 | + HIP_CHECK(hipGetLastError()); |
| 169 | + |
| 170 | + // Copy result back to host |
| 171 | + std::vector<int32_t> D_h(D_size); |
| 172 | + HIP_CHECK(hipMemcpy(D_h.data(), D_d, D_size * sizeof(int32_t), hipMemcpyDeviceToHost)); |
| 173 | + |
| 174 | + std::cout << "Sum of squared differences of host/device result matrices: " |
| 175 | + << compute_l2_error(Dref_h, D_h, M, N, LDD, LDD) |
| 176 | + << std::endl; |
| 177 | + |
| 178 | + HIP_CHECK(hipFree(D_d)); |
| 179 | + HIP_CHECK(hipFree(B_d)); |
| 180 | + HIP_CHECK(hipFree(A_d)); |
| 181 | + return 0; |
| 182 | +} |
0 commit comments