Skip to content

Commit 2543939

Browse files
committedMay 29, 2019
opt ins and add omp
1 parent 2e6810f commit 2543939

File tree

2 files changed

+75
-46
lines changed

2 files changed

+75
-46
lines changed
 

‎intel/README

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ This is a beta version.
33
It is for Intel AVX512.
44

55
icc mm.c -lmkl_rt -o mm
6+
icc mm.c -qopenmp -lmkl_rt -o mm
67

78
./mm
89

‎intel/mm.c

+74-46
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44

55
#include <mkl.h>
66

7-
#define MM 4000
8-
#define KK 4000
9-
#define NN 4000
7+
/*
8+
#define MM 4096
9+
#define KK 4096
10+
#define NN 4096
1011
1112
#define BLOCK_M 64
1213
#define BLOCK_K 64
@@ -17,6 +18,21 @@
1718
#define SIMD_BLOCK_M BLOCK_M/SIMD_SIZE
1819
#define SIMD_BLOCK_K BLOCK_K/SIMD_SIZE
1920
#define SIMD_BLOCK_N BLOCK_N/SIMD_SIZE
21+
*/
22+
23+
/*
 * Problem dimensions: A is MM x KK, B is KK x NN and C is MM x NN
 * (row-major, as allocated in main). They stay 64-bit so that products
 * such as MM*KK are evaluated in 64-bit arithmetic.
 */
const unsigned long long MM = 4096;
const unsigned long long KK = 4096;
const unsigned long long NN = 4096;

/*
 * Tile and vector sizes.
 *
 * These are enum constants rather than `const` variables because in C a
 * `const` object is not an integer constant expression: using one to
 * initialise another file-scope object, or to size a local array, is
 * either rejected or silently produces a variable-length array (which
 * cannot carry an initialiser). Enum constants are true compile-time
 * constants, so arrays such as `float a[BLOCK_M*BLOCK_K]` are ordinary
 * fixed-size arrays.
 */
enum {
    BLOCK_M = 64,                         /* rows of A/C handled per tile      */
    BLOCK_K = 64,                         /* depth (K) handled per tile        */
    BLOCK_N = 64,                         /* columns of B/C handled per tile   */

    SIMD_SIZE = 16,                       /* floats in one 512-bit vector      */

    SIMD_BLOCK_M = BLOCK_M / SIMD_SIZE,   /* tile extents counted in vectors   */
    SIMD_BLOCK_K = BLOCK_K / SIMD_SIZE,
    SIMD_BLOCK_N = BLOCK_N / SIMD_SIZE
};
2036

2137
// L1 (32K) should be enough for an 8K (16*512)
2238

@@ -26,11 +42,11 @@ void aligned_MM(float *A, float *B, float *C, int M, int K, int N) {
2642

2743
for(I=0; I<M; I++)
2844
for(J=0; J<K; J+=SIMD_SIZE)
29-
for(L=0; L<M; L+=SIMD_SIZE) {
45+
for(L=0; L<N; L+=SIMD_SIZE) {
3046
__m512 a;
3147
__m512 b, c;
3248
float *A_ptr = A + I*K + J;
33-
float *B_ptr = B + J*N + K;
49+
float *B_ptr = B + J*N + L;
3450
float *C_ptr = C + I*N + L;
3551

3652
c = _mm512_load_ps(C_ptr);
@@ -43,30 +59,50 @@ void aligned_MM(float *A, float *B, float *C, int M, int K, int N) {
4359
}
4460
}
4561

46-
void Block_C_SIMD(const float *A, const float *B, float *C, const int m, const int n, const int M, const int K, const int N) {
62+
// Block_AB: copies elements of A and B into two stack buffers, a and b.
//
// Parameters:
//   A, B    - source matrices, indexed with row strides K and N respectively.
//   C       - not referenced in this body.
//   m, n, k - loop bounds for the copies (m and k for A; k and n for B).
//   M, K, N - K and N are used as row strides; M is not referenced.
//
// The buffers a and b are local and are neither read back nor passed out,
// so as written the function has no effect visible to its caller.
// NOTE(review): this looks like an unfinished packing routine - confirm intent.
void Block_AB(float *A, float *B, float *C, const int m, const int n, const int k, int M, int K, int N) {
63+
// j is declared but never used.
int i, j, l;
64+
// Scratch buffers sized for one BLOCK_M x BLOCK_K and one BLOCK_K x BLOCK_N tile.
float a[BLOCK_M*BLOCK_K], b[BLOCK_N*BLOCK_K];
65+
for(i=0; i<k; i++)
66+
for(l=0; l<m; l++)
67+
// NOTE(review): the destination index ignores l, so every row overwrites
// a[i] and only the value from l == m-1 remains. A packed layout would
// need l in the index - confirm which layout (row- or column-packed) is intended.
a[i] = A[l*K+i];
68+
for(l=0; l<k; l++)
69+
for(i=0; i<n; i++)
70+
// NOTE(review): same pattern as above - b[i] is overwritten for every l.
b[i] = B[l*N+i];
71+
72+
}
73+
74+
void Block_C_SIMD(float *A, float *B, float *C, const int m, const int n, const int M, const int K, const int N) {
4775
int i, j, l;
4876
int align_K = BLOCK_K*(K/BLOCK_K);
4977

5078
__m512 b[SIMD_BLOCK_N];
5179
__m512 c[BLOCK_M*SIMD_BLOCK_N] = {0.};
5280

5381
for(l=0; l<K; l++) {
54-
for(j=0; j<SIMD_BLOCK_M; j++)
55-
b[j] = _mm512_load_ps(&B[l*N+j*SIMD_SIZE]);
82+
float *a_ptr = &A[l];
83+
float *b_ptr = &B[l*N];
84+
for(j=0; j<SIMD_BLOCK_N; j++, b_ptr+=SIMD_SIZE)
85+
b[j] = _mm512_load_ps(b_ptr);
5686

87+
__m512 *c_ptr = &c[0];
5788
for(i=0; i<m; i++) {
58-
__m512 a = _mm512_set1_ps(A[i*K+l]);
59-
for(j=0; j<SIMD_BLOCK_M; j++) {
60-
c[i*SIMD_BLOCK_N+j] += a*b[j];
89+
__m512 a = _mm512_set1_ps(*a_ptr);
90+
for(j=0; j<SIMD_BLOCK_N; j++) {
91+
*c_ptr += a*b[j];
92+
c_ptr++;
6193
}
94+
a_ptr += K;
6295
}
6396
}
6497

65-
for(i=0; i<m; i++)
66-
for(j=0; j<SIMD_BLOCK_M; j++) {
67-
__m512 cc = _mm512_load_ps(&C[i*M+j*SIMD_SIZE]);
68-
cc += c[i*SIMD_BLOCK_N+j];
69-
_mm512_store_ps(&C[i*M+j*SIMD_SIZE],cc);
98+
for(i=0; i<m; i++) {
99+
float *c_ptr = &C[i*M];
100+
for(j=0; j<SIMD_BLOCK_N; j++) {
101+
__m512 cc = _mm512_load_ps(c_ptr);
102+
cc += c[i*SIMD_BLOCK_N+j];
103+
_mm512_store_ps(c_ptr,cc);
104+
c_ptr+=SIMD_SIZE;
105+
}
70106
}
71107
}
72108

@@ -82,18 +118,6 @@ void MatMul(float *A, float *B, float *C, int M, int K, int N) {
82118
}
83119
}
84120

85-
void Block_AB(float *A, float *B, float *C, const int m, const int n, const int k, int M, int K, int N) {
86-
int i, j, l;
87-
float a[BLOCK_M*BLOCK_K], b[BLOCK_N*BLOCK_K];
88-
for(i=0; i<k; i++)
89-
for(l=0; l<m; l++)
90-
a[i] = A[l*K+i];
91-
for(l=0; l<k; l++)
92-
for(i=0; i<n; i++)
93-
b[i] = B[l*N+i];
94-
95-
}
96-
97121
void Block_C(float *A, float *B, float *C, const int m, const int n, int M, int K, int N) {
98122
int i, j, l;
99123
int align_K = BLOCK_K*(K/BLOCK_K);
@@ -136,6 +160,7 @@ void MatMul_block_ins(float *A, float *B, float *C, int M, int K, int N) {
136160
int align_M = BLOCK_M*(M/BLOCK_M);
137161
int align_N = BLOCK_N*(N/BLOCK_N);
138162

163+
#pragma omp parallel for
139164
for(I=0; I<align_M; I+=BLOCK_M)
140165
for(L=0; L<align_N; L+=BLOCK_N)
141166
Block_C_SIMD(&A[I*K], &B[L], &C[I*N+L], BLOCK_M, BLOCK_N, M, K, N);
@@ -159,45 +184,48 @@ int main() {
159184
float *A, *B, *C;
160185

161186
struct timeval begin, end;
162-
int timeuse;
187+
float timeuse;
163188

164189
A = (float*)_mm_malloc(sizeof(float)*MM*KK, 64);
165190
B = (float*)_mm_malloc(sizeof(float)*KK*NN, 64);
166191
C = (float*)_mm_malloc(sizeof(float)*MM*NN, 64);
167192

168-
for (i=0; i<MM*KK; i++) A[i] = 1.;
169-
for (i=0; i<KK*NN; i++) B[i] = 2.;
193+
for (i=0; i<MM*KK; i++) A[i] = i;
194+
for (i=0; i<KK*NN; i++) B[i] = 2.*i;
170195
for (i=0; i<MM*NN; i++) C[i] = 0.;
171196

172-
gettimeofday( &begin, NULL );
173197
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MM, NN, KK, 1., A, KK, B, NN, 1., C, NN);
198+
gettimeofday( &begin, NULL );
199+
for(int f=0;f<5;f++)
200+
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MM, NN, KK, 1., A, KK, B, NN, 1., C, NN);
174201
gettimeofday( &end, NULL );
175-
timeuse = 1000000 * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec;
176-
printf("mkl time: %d us\n", timeuse);
202+
timeuse = (1000000. * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec)/1000.;
203+
printf("mkl time: %.2f ms\n", timeuse/5);
177204
/*
178205
gettimeofday( &begin, NULL );
179206
MatMul(A, B, C, MM, KK, NN);
180207
gettimeofday( &end, NULL );
181208
timeuse = 1000000 * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec;
182209
printf("org time: %d us\n", timeuse);
183-
*/
210+
184211
gettimeofday( &begin, NULL );
185-
MatMul_block(A, B, C, MM, KK, NN);
212+
MatMul_ins(A, B, C, MM, KK, NN);
186213
gettimeofday( &end, NULL );
187-
timeuse = 1000000 * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec;
188-
printf("opt time: %d us\n", timeuse);
214+
timeuse = (1000000 * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec)/1000.;
215+
printf("ins time: %.2f ms\n", timeuse);
216+
189217
gettimeofday( &begin, NULL );
190-
MatMul_block_ins(A, B, C, MM, KK, NN);
218+
MatMul_block(A, B, C, MM, KK, NN);
191219
gettimeofday( &end, NULL );
192-
timeuse = 1000000 * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec;
193-
printf("opt time: %d us\n", timeuse);
194-
/*
220+
timeuse = (1000000. * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec)/1000.;
221+
printf("block time: %.2f ms\n", timeuse);
222+
*/
195223
gettimeofday( &begin, NULL );
196-
MatMul_ins(A, B, C, MM, KK, NN);
224+
MatMul_block_ins(A, B, C, MM, KK, NN);
197225
gettimeofday( &end, NULL );
198-
timeuse = 1000000 * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec;
199-
printf("ins time: %d us\n", timeuse);
200-
*/
226+
timeuse = (1000000. * ( end.tv_sec - begin.tv_sec ) + end.tv_usec - begin.tv_usec)/1000.;
227+
printf("block ins time: %.2f ms\n", timeuse);
228+
201229
_mm_free(A);
202230
_mm_free(B);
203231
_mm_free(C);

0 commit comments

Comments
 (0)
Please sign in to comment.