Skip to content

Commit 49de92f

Browse files
committed
ggml-cpu: add rvv repacking for q8_0
1 parent b90ab0b commit 49de92f

3 files changed

Lines changed: 209 additions & 4 deletions

File tree

ggml/src/ggml-cpu/arch/riscv/repack.cpp

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,59 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
203203
ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
204204
}
205205

206+
void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
207+
const int qk = QK8_0;
208+
const int nb = n / qk;
209+
const int ncols_interleaved = 16;
210+
const int blocklen = 1;
211+
212+
assert (n % qk == 0);
213+
assert (nc % ncols_interleaved == 0);
214+
215+
UNUSED(s);
216+
UNUSED(bs);
217+
UNUSED(vx);
218+
UNUSED(vy);
219+
UNUSED(nr);
220+
UNUSED(nc);
221+
UNUSED(nb);
222+
UNUSED(ncols_interleaved);
223+
UNUSED(blocklen);
224+
225+
#if defined __riscv_v_intrinsic
226+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
227+
for (int x = 0; x < nc / ncols_interleaved; x++) {
228+
const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
229+
230+
// 1x16 Accumulator1
231+
vfloat32m2_t sumf = __riscv_vfmv_v_f_f32m2(0.0f, 16);
232+
233+
for (int l = 0; l < nb; l++) {
234+
// 1x32 integer accumulator
235+
vint32m2_t sumi = __riscv_vmv_v_x_i32m2(0.0f, 16);
236+
237+
// Accumulation loop.
238+
for (int i = 0; i < QK4_NL / 2; i++) {
239+
// Load `b_ptr`.
240+
const vint8mf2_t b_0 = __riscv_vle8_v_i8mf2((const int8_t *)&b_ptr[l].qs[i * 16], 16);
241+
// const vint16m1_t b_0_16 = __riscv_vwcvt_x_x_v_i16m1(b_0, 16);
242+
243+
sumi = __riscv_vwadd_wv_i32m2(sumi, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i], 16), 16);
244+
}
245+
246+
const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((_Float16 *)b_ptr[l].d, 16);
247+
const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d, 16);
248+
249+
sumf = __riscv_vfmacc_vv_f32m2(sumf, __riscv_vfcvt_f_x_v_f32m2(sumi, 16), d_0, 16);
250+
}
251+
252+
__riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
253+
}
254+
return;
255+
#endif
256+
ggml_gemv_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
257+
}
258+
206259
void ggml_gemv_iq4_nl_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
207260
const int qk = QK8_0;
208261
const int nb = n / qk;
@@ -638,6 +691,80 @@ void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
638691
ggml_gemm_iq4_nl_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
639692
}
640693

694+
void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
695+
const int qk = QK8_0;
696+
const int nb = n / qk;
697+
const int ncols_interleaved = 16;
698+
const int blocklen = 1;
699+
700+
assert (n % qk == 0);
701+
assert (nr % 4 == 0);
702+
assert (nc % ncols_interleaved == 0);
703+
704+
UNUSED(s);
705+
UNUSED(bs);
706+
UNUSED(vx);
707+
UNUSED(vy);
708+
UNUSED(nr);
709+
UNUSED(nc);
710+
UNUSED(nb);
711+
UNUSED(ncols_interleaved);
712+
UNUSED(blocklen);
713+
714+
#if defined __riscv_v_intrinsic
715+
for (int y = 0; y < nr / 4; y++) {
716+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
717+
for (int x = 0; x < nc / ncols_interleaved; x++) {
718+
const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
719+
720+
// 4x16 Accumulators
721+
vfloat32m2_t sumf_0 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
722+
vfloat32m2_t sumf_1 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
723+
vfloat32m2_t sumf_2 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
724+
vfloat32m2_t sumf_3 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
725+
726+
for (int l = 0; l < nb; l++) {
727+
// 4x16 integer accumulators
728+
vint32m2_t sumi_0 = __riscv_vmv_v_x_i32m2(0.0f, 16);
729+
vint32m2_t sumi_1 = __riscv_vmv_v_x_i32m2(0.0f, 16);
730+
vint32m2_t sumi_2 = __riscv_vmv_v_x_i32m2(0.0f, 16);
731+
vint32m2_t sumi_3 = __riscv_vmv_v_x_i32m2(0.0f, 16);
732+
733+
// Accumulation loop.
734+
for (int i = 0; i < QK8_0; i++) {
735+
// Load `b_ptr`.
736+
const vint8mf2_t b_0 = __riscv_vle8_v_i8mf2((const int8_t *)&b_ptr[l].qs[i * 16], 16);
737+
// const vint16m1_t b_0_16 = __riscv_vwcvt_x_x_v_i16m1(b_0, 16);
738+
739+
sumi_0 = __riscv_vwadd_wv_i32m2(sumi_0, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 0], 16), 16);
740+
sumi_1 = __riscv_vwadd_wv_i32m2(sumi_1, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 1], 16), 16);
741+
sumi_2 = __riscv_vwadd_wv_i32m2(sumi_2, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 2], 16), 16);
742+
sumi_3 = __riscv_vwadd_wv_i32m2(sumi_3, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 3], 16), 16);
743+
}
744+
745+
const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((_Float16 *)b_ptr[l].d, 16);
746+
const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[0], 16);
747+
const vfloat32m2_t d_1 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[1], 16);
748+
const vfloat32m2_t d_2 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[2], 16);
749+
const vfloat32m2_t d_3 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[3], 16);
750+
751+
sumf_0 = __riscv_vfmacc_vv_f32m2(sumf_0, __riscv_vfcvt_f_x_v_f32m2(sumi_0, 16), d_0, 16);
752+
sumf_1 = __riscv_vfmacc_vv_f32m2(sumf_1, __riscv_vfcvt_f_x_v_f32m2(sumi_1, 16), d_1, 16);
753+
sumf_2 = __riscv_vfmacc_vv_f32m2(sumf_2, __riscv_vfcvt_f_x_v_f32m2(sumi_2, 16), d_2, 16);
754+
sumf_3 = __riscv_vfmacc_vv_f32m2(sumf_3, __riscv_vfcvt_f_x_v_f32m2(sumi_3, 16), d_3, 16);
755+
}
756+
757+
__riscv_vse32_v_f32m2(s + (y * 4 + 0) * bs + x * 16, sumf_0, 16);
758+
__riscv_vse32_v_f32m2(s + (y * 4 + 1) * bs + x * 16, sumf_1, 16);
759+
__riscv_vse32_v_f32m2(s + (y * 4 + 2) * bs + x * 16, sumf_2, 16);
760+
__riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
761+
}
762+
}
763+
return;
764+
#endif
765+
ggml_gemm_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
766+
}
767+
641768
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
642769
const int qk = QK8_0;
643770
const int nb = n / qk;

ggml/src/ggml-cpu/repack.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2137,6 +2137,55 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
21372137
return 0;
21382138
}
21392139

2140+
static block_q8_0x16 make_block_q8_0x16(block_q8_0 * in, unsigned int blck_size_interleave) {
2141+
block_q8_0x16 out;
2142+
2143+
for (int i = 0; i < 16; i++) {
2144+
out.d[i] = in[i].d;
2145+
}
2146+
2147+
const int end = QK8_0 * 16 / blck_size_interleave;
2148+
for (int i = 0; i < end; ++i) {
2149+
int src_id = i % 16;
2150+
int src_offset = (i / 16) * blck_size_interleave;
2151+
int dst_offset = i * blck_size_interleave;
2152+
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
2153+
}
2154+
2155+
return out;
2156+
}
2157+
2158+
static int repack_q8_0_to_q8_0_16_bl(struct ggml_tensor * t,
2159+
int interleave_block,
2160+
const void * GGML_RESTRICT data,
2161+
size_t data_size) {
2162+
GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
2163+
constexpr int nrows_interleaved = 16;
2164+
2165+
block_q8_0x16 * dst = (block_q8_0x16 *) t->data;
2166+
const block_q8_0 * src = (const block_q8_0 *) data;
2167+
block_q8_0 dst_tmp[4];
2168+
int nrow = ggml_nrows(t);
2169+
int nblocks = t->ne[0] / QK8_0;
2170+
2171+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
2172+
2173+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
2174+
return -1;
2175+
}
2176+
2177+
for (int b = 0; b < nrow; b += nrows_interleaved) {
2178+
for (int64_t x = 0; x < nblocks; x++) {
2179+
for (int i = 0; i < nrows_interleaved; i++) {
2180+
dst_tmp[i] = src[x + i * nblocks];
2181+
}
2182+
*dst++ = make_block_q8_0x16(dst_tmp, interleave_block);
2183+
}
2184+
src += nrows_interleaved * nblocks;
2185+
}
2186+
return 0;
2187+
}
2188+
21402189
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
21412190
block_iq4_nlx4 out;
21422191

@@ -2418,6 +2467,10 @@ template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * da
24182467
return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
24192468
}
24202469

2470+
template <> int repack<block_q8_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
2471+
return repack_q8_0_to_q8_0_16_bl(t, 8, data, data_size);
2472+
}
2473+
24212474
// gemv
24222475
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
24232476
void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -2474,6 +2527,10 @@ template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
24742527
ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
24752528
}
24762529

2530+
template <> void gemv<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2531+
ggml_gemv_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
2532+
}
2533+
24772534
// gemm
24782535
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
24792536
void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -2530,6 +2587,10 @@ template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
25302587
ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
25312588
}
25322589

2590+
template <> void gemm<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2591+
ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
2592+
}
2593+
25332594
class tensor_traits_base : public ggml::cpu::tensor_traits {
25342595
public:
25352596
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -2930,6 +2991,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
29302991
// instance for Q8_0
29312992
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
29322993
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
2994+
static const ggml::cpu::repack::tensor_traits<block_q8_0, 1, 16, GGML_TYPE_Q8_0> q8_0_16x1_q8_0;
29332995

29342996
if (cur->type == GGML_TYPE_Q4_0) {
29352997
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
@@ -3003,6 +3065,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
30033065
return &q8_0_4x4_q8_0;
30043066
}
30053067
}
3068+
if (ggml_cpu_has_riscv_v()) {
3069+
#if defined __riscv_zvfh
3070+
switch (__riscv_vlenb() * 8) {
3071+
case 128: { break; } // TODO
3072+
case 256: { if (cur->ne[1] % 16 == 0) { return &q8_0_16x1_q8_0; } break; }
3073+
case 512: { break; } // TODO
3074+
case 1024: { break; } // TODO
3075+
default: { return nullptr; }
3076+
}
3077+
#endif
3078+
}
30063079
}
30073080

30083081
return nullptr;

ggml/src/ggml-cpu/repack.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ using block_q4_0x4 = block<4, 4>;
3535
using block_q4_0x8 = block<4, 8>;
3636
using block_q8_0x4 = block<8, 4>;
3737
using block_q8_0x8 = block<8, 8>;
38+
using block_q8_0x16 = block<8, 16>;
3839

3940
struct block_q4_Kx8 {
4041
ggml_half d[8]; // super-block scale for quantized scales
@@ -103,6 +104,9 @@ void ggml_gemv_iq4_nl_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
103104
void ggml_gemv_iq4_nl_4x16_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
104105
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
105106
void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
107+
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
108+
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
109+
void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
106110
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
107111
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
108112
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -114,10 +118,9 @@ void ggml_gemm_iq4_nl_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
114118
void ggml_gemm_iq4_nl_4x16_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
115119
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
116120
void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
117-
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
118-
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
119121
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
120122
void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
123+
void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
121124

122125
// Native implementations
123126
void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -137,6 +140,9 @@ void ggml_gemv_iq4_nl_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
137140
void ggml_gemv_iq4_nl_4x16_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
138141
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
139142
void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
143+
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
144+
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
145+
void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
140146
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
141147
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
142148
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -148,10 +154,9 @@ void ggml_gemm_iq4_nl_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
148154
void ggml_gemm_iq4_nl_4x16_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
149155
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
150156
void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
151-
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
152-
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
153157
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
154158
void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
159+
void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
155160

156161
#if defined(__cplusplus)
157162
} // extern "C"

0 commit comments

Comments
 (0)