Skip to content

Commit c640f8b

Browse files
committed
ggml-cpu: add rvv repacking for q8_0
1 parent 090e5e6 commit c640f8b

3 files changed

Lines changed: 209 additions & 4 deletions

File tree

ggml/src/ggml-cpu/arch/riscv/repack.cpp

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,59 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
203203
ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
204204
}
205205

206+
void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
207+
const int qk = QK8_0;
208+
const int nb = n / qk;
209+
const int ncols_interleaved = 16;
210+
const int blocklen = 1;
211+
212+
assert (n % qk == 0);
213+
assert (nc % ncols_interleaved == 0);
214+
215+
UNUSED(s);
216+
UNUSED(bs);
217+
UNUSED(vx);
218+
UNUSED(vy);
219+
UNUSED(nr);
220+
UNUSED(nc);
221+
UNUSED(nb);
222+
UNUSED(ncols_interleaved);
223+
UNUSED(blocklen);
224+
225+
#if defined __riscv_v_intrinsic
226+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
227+
for (int x = 0; x < nc / ncols_interleaved; x++) {
228+
const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
229+
230+
// 1x16 Accumulator1
231+
vfloat32m2_t sumf = __riscv_vfmv_v_f_f32m2(0.0f, 16);
232+
233+
for (int l = 0; l < nb; l++) {
234+
// 1x32 integer accumulator
235+
vint32m2_t sumi = __riscv_vmv_v_x_i32m2(0.0f, 16);
236+
237+
// Accumulation loop.
238+
for (int i = 0; i < QK8_0; i++) {
239+
// Load `b_ptr`.
240+
const vint8mf2_t b_0 = __riscv_vle8_v_i8mf2((const int8_t *)&b_ptr[l].qs[i * 16], 16);
241+
// const vint16m1_t b_0_16 = __riscv_vwcvt_x_x_v_i16m1(b_0, 16);
242+
243+
sumi = __riscv_vwadd_wv_i32m2(sumi, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i], 16), 16);
244+
}
245+
246+
const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((_Float16 *)b_ptr[l].d, 16);
247+
const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d, 16);
248+
249+
sumf = __riscv_vfmacc_vv_f32m2(sumf, __riscv_vfcvt_f_x_v_f32m2(sumi, 16), d_0, 16);
250+
}
251+
252+
__riscv_vse32_v_f32m2(s + x * 16, sumf, 16);
253+
}
254+
return;
255+
#endif
256+
ggml_gemv_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
257+
}
258+
206259
void ggml_gemv_iq4_nl_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
207260
const int qk = QK8_0;
208261
const int nb = n / qk;
@@ -638,6 +691,80 @@ void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
638691
ggml_gemm_iq4_nl_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
639692
}
640693

694+
void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
695+
const int qk = QK8_0;
696+
const int nb = n / qk;
697+
const int ncols_interleaved = 16;
698+
const int blocklen = 1;
699+
700+
assert (n % qk == 0);
701+
assert (nr % 4 == 0);
702+
assert (nc % ncols_interleaved == 0);
703+
704+
UNUSED(s);
705+
UNUSED(bs);
706+
UNUSED(vx);
707+
UNUSED(vy);
708+
UNUSED(nr);
709+
UNUSED(nc);
710+
UNUSED(nb);
711+
UNUSED(ncols_interleaved);
712+
UNUSED(blocklen);
713+
714+
#if defined __riscv_v_intrinsic
715+
for (int y = 0; y < nr / 4; y++) {
716+
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
717+
for (int x = 0; x < nc / ncols_interleaved; x++) {
718+
const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
719+
720+
// 4x16 Accumulators
721+
vfloat32m2_t sumf_0 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
722+
vfloat32m2_t sumf_1 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
723+
vfloat32m2_t sumf_2 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
724+
vfloat32m2_t sumf_3 = __riscv_vfmv_v_f_f32m2(0.0f, 16);
725+
726+
for (int l = 0; l < nb; l++) {
727+
// 4x16 integer accumulators
728+
vint32m2_t sumi_0 = __riscv_vmv_v_x_i32m2(0.0f, 16);
729+
vint32m2_t sumi_1 = __riscv_vmv_v_x_i32m2(0.0f, 16);
730+
vint32m2_t sumi_2 = __riscv_vmv_v_x_i32m2(0.0f, 16);
731+
vint32m2_t sumi_3 = __riscv_vmv_v_x_i32m2(0.0f, 16);
732+
733+
// Accumulation loop.
734+
for (int i = 0; i < QK8_0; i++) {
735+
// Load `b_ptr`.
736+
const vint8mf2_t b_0 = __riscv_vle8_v_i8mf2((const int8_t *)&b_ptr[l].qs[i * 16], 16);
737+
// const vint16m1_t b_0_16 = __riscv_vwcvt_x_x_v_i16m1(b_0, 16);
738+
739+
sumi_0 = __riscv_vwadd_wv_i32m2(sumi_0, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 0], 16), 16);
740+
sumi_1 = __riscv_vwadd_wv_i32m2(sumi_1, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 1], 16), 16);
741+
sumi_2 = __riscv_vwadd_wv_i32m2(sumi_2, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 2], 16), 16);
742+
sumi_3 = __riscv_vwadd_wv_i32m2(sumi_3, __riscv_vwmul_vx_i16m1(b_0, a_ptr[l].qs[i * 4 + 3], 16), 16);
743+
}
744+
745+
const vfloat16m1_t b_d = __riscv_vle16_v_f16m1((_Float16 *)b_ptr[l].d, 16);
746+
const vfloat32m2_t d_0 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[0], 16);
747+
const vfloat32m2_t d_1 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[1], 16);
748+
const vfloat32m2_t d_2 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[2], 16);
749+
const vfloat32m2_t d_3 = __riscv_vfwmul_vf_f32m2(b_d, *(const _Float16 *)&a_ptr[l].d[3], 16);
750+
751+
sumf_0 = __riscv_vfmacc_vv_f32m2(sumf_0, __riscv_vfcvt_f_x_v_f32m2(sumi_0, 16), d_0, 16);
752+
sumf_1 = __riscv_vfmacc_vv_f32m2(sumf_1, __riscv_vfcvt_f_x_v_f32m2(sumi_1, 16), d_1, 16);
753+
sumf_2 = __riscv_vfmacc_vv_f32m2(sumf_2, __riscv_vfcvt_f_x_v_f32m2(sumi_2, 16), d_2, 16);
754+
sumf_3 = __riscv_vfmacc_vv_f32m2(sumf_3, __riscv_vfcvt_f_x_v_f32m2(sumi_3, 16), d_3, 16);
755+
}
756+
757+
__riscv_vse32_v_f32m2(s + (y * 4 + 0) * bs + x * 16, sumf_0, 16);
758+
__riscv_vse32_v_f32m2(s + (y * 4 + 1) * bs + x * 16, sumf_1, 16);
759+
__riscv_vse32_v_f32m2(s + (y * 4 + 2) * bs + x * 16, sumf_2, 16);
760+
__riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16);
761+
}
762+
}
763+
return;
764+
#endif
765+
ggml_gemm_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc);
766+
}
767+
641768
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
642769
const int qk = QK8_0;
643770
const int nb = n / qk;

ggml/src/ggml-cpu/repack.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2447,6 +2447,55 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
24472447
return 0;
24482448
}
24492449

2450+
static block_q8_0x16 make_block_q8_0x16(block_q8_0 * in, unsigned int blck_size_interleave) {
2451+
block_q8_0x16 out;
2452+
2453+
for (int i = 0; i < 16; i++) {
2454+
out.d[i] = in[i].d;
2455+
}
2456+
2457+
const int end = QK8_0 * 16 / blck_size_interleave;
2458+
for (int i = 0; i < end; ++i) {
2459+
int src_id = i % 16;
2460+
int src_offset = (i / 16) * blck_size_interleave;
2461+
int dst_offset = i * blck_size_interleave;
2462+
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
2463+
}
2464+
2465+
return out;
2466+
}
2467+
2468+
static int repack_q8_0_to_q8_0_16_bl(struct ggml_tensor * t,
2469+
int interleave_block,
2470+
const void * GGML_RESTRICT data,
2471+
size_t data_size) {
2472+
GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
2473+
constexpr int nrows_interleaved = 16;
2474+
2475+
block_q8_0x16 * dst = (block_q8_0x16 *) t->data;
2476+
const block_q8_0 * src = (const block_q8_0 *) data;
2477+
block_q8_0 dst_tmp[16];
2478+
int nrow = ggml_nrows(t);
2479+
int nblocks = t->ne[0] / QK8_0;
2480+
2481+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
2482+
2483+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
2484+
return -1;
2485+
}
2486+
2487+
for (int b = 0; b < nrow; b += nrows_interleaved) {
2488+
for (int64_t x = 0; x < nblocks; x++) {
2489+
for (int i = 0; i < nrows_interleaved; i++) {
2490+
dst_tmp[i] = src[x + i * nblocks];
2491+
}
2492+
*dst++ = make_block_q8_0x16(dst_tmp, interleave_block);
2493+
}
2494+
src += nrows_interleaved * nblocks;
2495+
}
2496+
return 0;
2497+
}
2498+
24502499
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
24512500
block_iq4_nlx4 out;
24522501

@@ -2732,6 +2781,10 @@ template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * da
27322781
return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
27332782
}
27342783

2784+
template <> int repack<block_q8_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
2785+
return repack_q8_0_to_q8_0_16_bl(t, 1, data, data_size);
2786+
}
2787+
27352788
// gemv
27362789
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
27372790
void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -2792,6 +2845,10 @@ template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
27922845
ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
27932846
}
27942847

2848+
template <> void gemv<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2849+
ggml_gemv_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
2850+
}
2851+
27952852
// gemm
27962853
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
27972854
void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -2852,6 +2909,10 @@ template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
28522909
ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
28532910
}
28542911

2912+
template <> void gemm<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2913+
ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
2914+
}
2915+
28552916
class tensor_traits_base : public ggml::cpu::tensor_traits {
28562917
public:
28572918
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -3255,6 +3316,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
32553316
// instance for Q8_0
32563317
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
32573318
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
3319+
static const ggml::cpu::repack::tensor_traits<block_q8_0, 1, 16, GGML_TYPE_Q8_0> q8_0_16x1_q8_0;
32583320

32593321
if (cur->type == GGML_TYPE_Q4_0) {
32603322
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
@@ -3334,6 +3396,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
33343396
return &q8_0_4x4_q8_0;
33353397
}
33363398
}
3399+
if (ggml_cpu_has_riscv_v()) {
3400+
#if defined __riscv_zvfh
3401+
switch (__riscv_vlenb() * 8) {
3402+
case 128: { break; } // TODO
3403+
case 256: { if (cur->ne[1] % 16 == 0) { return &q8_0_16x1_q8_0; } break; }
3404+
case 512: { break; } // TODO
3405+
case 1024: { break; } // TODO
3406+
default: { return nullptr; }
3407+
}
3408+
#endif
3409+
}
33373410
}
33383411

33393412
return nullptr;

ggml/src/ggml-cpu/repack.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ using block_q4_0x4 = block<4, 4>;
3535
using block_q4_0x8 = block<4, 8>;
3636
using block_q8_0x4 = block<8, 4>;
3737
using block_q8_0x8 = block<8, 8>;
38+
using block_q8_0x16 = block<8, 16>;
3839

3940
struct block_q4_Kx8 {
4041
ggml_half d[8]; // super-block scale for quantized scales
@@ -117,6 +118,9 @@ void ggml_gemv_iq4_nl_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
117118
void ggml_gemv_iq4_nl_4x16_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
118119
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
119120
void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
121+
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
122+
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
123+
void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
120124
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
121125
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
122126
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -129,10 +133,9 @@ void ggml_gemm_iq4_nl_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
129133
void ggml_gemm_iq4_nl_4x16_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
130134
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
131135
void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
132-
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
133-
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
134136
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
135137
void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
138+
void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
136139

137140
// Native implementations
138141
void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -153,6 +156,9 @@ void ggml_gemv_iq4_nl_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
153156
void ggml_gemv_iq4_nl_4x16_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
154157
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
155158
void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
159+
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
160+
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
161+
void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
156162
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
157163
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
158164
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -165,10 +171,9 @@ void ggml_gemm_iq4_nl_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
165171
void ggml_gemm_iq4_nl_4x16_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
166172
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
167173
void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
168-
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
169-
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
170174
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
171175
void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
176+
void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
172177

173178
#if defined(__cplusplus)
174179
} // extern "C"

0 commit comments

Comments
 (0)