Skip to content

Commit 41d77be

Browse files
committed
ggml-cpu: add repack GEMM and GEMV for floating-point
1 parent 5b0894f commit 41d77be

4 files changed

Lines changed: 39 additions & 35 deletions

File tree

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,8 @@
7373
// repack.cpp
7474
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
7575
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
76+
#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1
77+
#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1
7678
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
7779
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
7880
#define ggml_gemv_f16_1x16_f16_generic ggml_gemv_f16_1x16_f16

ggml/src/ggml-cpu/arch/riscv/repack.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -343,6 +343,8 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
343343

344344
template<int ncols_interleaved>
345345
static inline void ggml_gemv_f16_1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
346+
GGML_UNUSED(bs);
347+
346348
const int nb = n / 1;
347349

348350
assert (nr == 1);
@@ -402,6 +404,8 @@ void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const vo
402404

403405
template<int ncols_interleaved>
404406
static inline void ggml_gemv_f32_1xM_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
407+
GGML_UNUSED(bs);
408+
405409
const int nb = n / 1;
406410

407411
assert (nr == 1);

ggml/src/ggml-cpu/repack.cpp

Lines changed: 33 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ static inline int nearest_int(float fval) {
3131
return (i & 0x007fffff) - 0x00400000;
3232
}
3333

34-
// Helper template functions for `fp16` and `fp32`.
34+
// Helper functions for `fp16` and `fp32`.
3535

3636
template<int nrows_interleaved, int interleave_size>
3737
static inline void ggml_repack_mat_f16_NxK_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
@@ -333,7 +333,7 @@ static inline void ggml_gemv_f16_KxM_f16_generic(int n, float * GGML_RESTRICT s,
333333
for (int l = 0; l < nb; l++) {
334334
for (int j = 0; j < ncols_interleaved; j++) {
335335
for (int k = 0; k < interleave_size; k++) {
336-
sumf[j] += GGML_FP16_TO_FP32(b_ptr[l].d[j * interleave_size + k]) * GGML_FP16_TO_FP32(a_ptr[l + k]);
336+
sumf[j] += GGML_FP16_TO_FP32(b_ptr[l].d[j * interleave_size + k]) * GGML_FP16_TO_FP32(a_ptr[l * interleave_size + k]);
337337
}
338338
}
339339
}
@@ -363,7 +363,7 @@ static inline void ggml_gemv_f32_KxM_f32_generic(int n, float * GGML_RESTRICT s,
363363
for (int l = 0; l < nb; l++) {
364364
for (int j = 0; j < ncols_interleaved; j++) {
365365
for (int k = 0; k < interleave_size; k++) {
366-
sumf[j] += b_ptr[l].d[j * interleave_size + k] * a_ptr[l + k];
366+
sumf[j] += b_ptr[l].d[j * interleave_size + k] * a_ptr[l * interleave_size + k];
367367
}
368368
}
369369
}
@@ -375,7 +375,7 @@ template<int nrows, int interleave_size, int ncols_interleaved>
375375
static inline void ggml_gemm_f16_NxKxM_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
376376
const int nb = n / interleave_size;
377377

378-
assert (nr % nrows == 0);
378+
assert(nr % nrows == 0);
379379
assert(n % interleave_size == 0);
380380
assert(nc % ncols_interleaved == 0);
381381

@@ -395,7 +395,7 @@ static inline void ggml_gemm_f16_NxKxM_f16_generic(int n, float * GGML_RESTRICT
395395
for (int m = 0; m < nrows; m++) {
396396
for (int j = 0; j < ncols_interleaved; j++) {
397397
for (int k = 0; k < interleave_size; k++) {
398-
sumf[m][j] += b_ptr[l].d[j * interleave_size + k] * a_ptr[l].d[m * interleave_size + k];
398+
sumf[m][j] += GGML_FP16_TO_FP32(b_ptr[l].d[j * interleave_size + k]) * GGML_FP16_TO_FP32(a_ptr[l].d[m * interleave_size + k]);
399399
}
400400
}
401401
}
@@ -412,7 +412,7 @@ template<int nrows, int interleave_size, int ncols_interleaved>
412412
static inline void ggml_gemm_f32_NxKxM_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
413413
const int nb = n / interleave_size;
414414

415-
assert (nr % nrows == 0);
415+
assert(nr % nrows == 0);
416416
assert(n % interleave_size == 0);
417417
assert(nc % ncols_interleaved == 0);
418418

@@ -1913,34 +1913,34 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
19131913
GGML_UNUSED(data_size);
19141914
}
19151915

1916-
template<int nrows_interleaved, int interleave_size>
1917-
static int repack_f16_to_f16_N_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) {
1916+
template<int ncols_interleaved, int interleave_size>
1917+
static int repack_f16_to_f16_MxK_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) {
19181918
GGML_ASSERT(t->type == GGML_TYPE_F16);
19191919

19201920
const ggml_half * src = (const ggml_half *)data;
1921-
block_f16<nrows_interleaved, interleave_size> * dst = ( block_f16<nrows_interleaved, interleave_size> *)t->data;
1921+
block_f16<ncols_interleaved, interleave_size> * dst = ( block_f16<ncols_interleaved, interleave_size> *)t->data;
19221922

1923-
ggml_half dst_tmp[nrows_interleaved * interleave_size];
1923+
ggml_half dst_tmp[ncols_interleaved * interleave_size];
19241924

19251925
int nrow = ggml_nrows(t);
19261926
int row_size = t->ne[0];
19271927
int nblocks = row_size / interleave_size;
19281928

19291929
GGML_ASSERT(data_size == nrow * nblocks * interleave_size * sizeof(ggml_half));
19301930

1931-
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % interleave_size != 0) {
1931+
if (t->ne[1] % ncols_interleaved != 0 || t->ne[0] % interleave_size != 0) {
19321932
return -1;
19331933
}
19341934

1935-
for (int b = 0; b < nrow; b += nrows_interleaved) {
1935+
for (int b = 0; b < nrow; b += ncols_interleaved) {
19361936
for (int i = 0; i < nblocks; i++) {
1937-
for (int j = 0; j < nrows_interleaved; j++) {
1937+
for (int j = 0; j < ncols_interleaved; j++) {
19381938
for (int k = 0; k < interleave_size; k++) {
19391939
dst_tmp[j * interleave_size + k] = src[(j + b) * row_size + i * interleave_size + k];
19401940
}
19411941
}
1942-
block_f16<nrows_interleaved, interleave_size> out;
1943-
memcpy(&out.d, dst_tmp, sizeof(ggml_half) * nrows_interleaved * interleave_size);
1942+
block_f16<ncols_interleaved, interleave_size> out;
1943+
memcpy(&out.d, dst_tmp, sizeof(ggml_half) * ncols_interleaved * interleave_size);
19441944
*dst = out;
19451945
dst++;
19461946
}
@@ -1949,34 +1949,34 @@ static int repack_f16_to_f16_N_bl(struct ggml_tensor * t, const void * GGML_REST
19491949
return 0;
19501950
}
19511951

1952-
template<int nrows_interleaved, int interleave_size>
1953-
static int repack_f32_to_f32_N_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) {
1952+
template<int ncols_interleaved, int interleave_size>
1953+
static int repack_f32_to_f32_MxK_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) {
19541954
GGML_ASSERT(t->type == GGML_TYPE_F32);
19551955

19561956
const float * src = (const float *)data;
1957-
block_f32<nrows_interleaved, interleave_size> * dst = ( block_f32<nrows_interleaved, interleave_size> *)t->data;
1957+
block_f32<ncols_interleaved, interleave_size> * dst = ( block_f32<ncols_interleaved, interleave_size> *)t->data;
19581958

1959-
float dst_tmp[nrows_interleaved * interleave_size];
1959+
float dst_tmp[ncols_interleaved * interleave_size];
19601960

19611961
int nrow = ggml_nrows(t);
19621962
int row_size = t->ne[0];
19631963
int nblocks = row_size / interleave_size;
19641964

19651965
GGML_ASSERT(data_size == nrow * nblocks * interleave_size * sizeof(float));
19661966

1967-
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % interleave_size != 0) {
1967+
if (t->ne[1] % ncols_interleaved != 0 || t->ne[0] % interleave_size != 0) {
19681968
return -1;
19691969
}
19701970

1971-
for (int b = 0; b < nrow; b += nrows_interleaved) {
1971+
for (int b = 0; b < nrow; b += ncols_interleaved) {
19721972
for (int i = 0; i < nblocks; i++) {
1973-
for (int j = 0; j < nrows_interleaved; j++) {
1973+
for (int j = 0; j < ncols_interleaved; j++) {
19741974
for (int k = 0; k < interleave_size; k++) {
19751975
dst_tmp[j * interleave_size + k] = src[(j + b) * row_size + i * interleave_size + k];
19761976
}
19771977
}
1978-
block_f32<nrows_interleaved, interleave_size> out;
1979-
memcpy(&out.d, dst_tmp, sizeof(float) * nrows_interleaved * interleave_size);
1978+
block_f32<ncols_interleaved, interleave_size> out;
1979+
memcpy(&out.d, dst_tmp, sizeof(float) * ncols_interleaved * interleave_size);
19801980
*dst = out;
19811981
dst++;
19821982
}
@@ -2029,29 +2029,29 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
20292029
}
20302030

20312031
template <> int repack<ggml_half, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
2032-
return repack_f16_to_f16_N_bl<16, 1>(t, data, data_size);
2032+
return repack_f16_to_f16_MxK_bl<16, 1>(t, data, data_size);
20332033
}
20342034
template <> int repack<ggml_half, 1, 32>(struct ggml_tensor * t, const void * data, size_t data_size) {
2035-
return repack_f16_to_f16_N_bl<32, 1>(t, data, data_size);
2035+
return repack_f16_to_f16_MxK_bl<32, 1>(t, data, data_size);
20362036
}
20372037
template <> int repack<ggml_half, 1, 64>(struct ggml_tensor * t, const void * data, size_t data_size) {
2038-
return repack_f16_to_f16_N_bl<64, 1>(t, data, data_size);
2038+
return repack_f16_to_f16_MxK_bl<64, 1>(t, data, data_size);
20392039
}
20402040
template <> int repack<ggml_half, 1, 128>(struct ggml_tensor * t, const void * data, size_t data_size) {
2041-
return repack_f16_to_f16_N_bl<128, 1>(t, data, data_size);
2041+
return repack_f16_to_f16_MxK_bl<128, 1>(t, data, data_size);
20422042
}
20432043

20442044
template <> int repack<float, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
2045-
return repack_f32_to_f32_N_bl<16, 1>(t, data, data_size);
2045+
return repack_f32_to_f32_MxK_bl<16, 1>(t, data, data_size);
20462046
}
20472047
template <> int repack<float, 1, 32>(struct ggml_tensor * t, const void * data, size_t data_size) {
2048-
return repack_f32_to_f32_N_bl<32, 1>(t, data, data_size);
2048+
return repack_f32_to_f32_MxK_bl<32, 1>(t, data, data_size);
20492049
}
20502050
template <> int repack<float, 1, 64>(struct ggml_tensor * t, const void * data, size_t data_size) {
2051-
return repack_f32_to_f32_N_bl<64, 1>(t, data, data_size);
2051+
return repack_f32_to_f32_MxK_bl<64, 1>(t, data, data_size);
20522052
}
20532053
template <> int repack<float, 1, 128>(struct ggml_tensor * t, const void * data, size_t data_size) {
2054-
return repack_f32_to_f32_N_bl<128, 1>(t, data, data_size);
2054+
return repack_f32_to_f32_MxK_bl<128, 1>(t, data, data_size);
20552055
}
20562056

20572057
// gemv
@@ -2277,7 +2277,7 @@ template <typename BLOC_TYPE, int64_t NB_ROWS, int64_t INTER_SIZE, int64_t NB_CO
22772277

22782278
GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);
22792279

2280-
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
2280+
// If there are more than `NB_ROWS` rows in src1, use gemm; otherwise, use gemv.
22812281
if (nrows > (NB_ROWS - 1)) {
22822282
gemm<BLOC_TYPE, NB_ROWS, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
22832283
src0_ptr + src0_start * nb01, src1_ptr,

ggml/src/ggml-cpu/repack.h

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -145,7 +145,6 @@ void ggml_gemv_f16_1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, c
145145
void ggml_gemv_f16_1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
146146
void ggml_gemv_f16_1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
147147
void ggml_gemv_f16_1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
148-
void ggml_gemm_f16_4x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
149148
void ggml_gemm_f16_7x1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
150149
void ggml_gemm_f16_7x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
151150
void ggml_gemm_f16_7x1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -156,7 +155,6 @@ void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const voi
156155
void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
157156
void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
158157
void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
159-
void ggml_gemm_f16_4x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
160158
void ggml_gemm_f16_7x1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
161159
void ggml_gemm_f16_7x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
162160
void ggml_gemm_f16_7x1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

0 commit comments

Comments
 (0)