Skip to content

Commit 51311dd

Browse files
RehanQasim-devtaimur-10x
authored andcommitted
ggml-cpu: add vec_dot for iq2_xs, iq3_xxs
1 parent 748e622 commit 51311dd

2 files changed

Lines changed: 225 additions & 21 deletions

File tree

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,9 +131,7 @@
131131
// quants.c
132132
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
133133
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
134-
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
135134
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
136-
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
137135
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
138136
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
139137
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K

ggml/src/ggml-cpu/arch/riscv/quants.c

Lines changed: 225 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2089,7 +2089,7 @@ static const int8_t keven_signs_q2xs[1024] = {
20892089
};
20902090
#endif
20912091

2092-
void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2092+
static void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
20932093
assert(n % QK_K == 0);
20942094
assert(nrc == 1);
20952095
UNUSED(nrc);
@@ -2116,7 +2116,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs,
21162116

21172117
float sum = 0.0f;
21182118

2119-
#pragma GCC nounroll
2119+
#pragma GCC unroll 1
21202120
for (int ib32 = 0; ib32 < QK_K / 32; ib32 += 2) {
21212121
vint8m2_t q8_1 = __riscv_vle8_v_i8m2(q8, 32); q8 += 32;
21222122
vint8m2_t q8_2 = __riscv_vle8_v_i8m2(q8, 32); q8 += 32;
@@ -2180,7 +2180,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs,
21802180
*s = 0.125f * sumf;
21812181
}
21822182

2183-
void ggml_vec_dot_iq2_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2183+
static void ggml_vec_dot_iq2_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
21842184
assert(n % QK_K == 0);
21852185
assert(nrc == 1);
21862186
UNUSED(nrc);
@@ -2278,16 +2278,18 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
22782278
#if defined __riscv_v_intrinsic
22792279
switch (__riscv_vlenb() * 8) {
22802280
case 128:
2281-
return ggml_vec_dot_iq2_xxs_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
2281+
ggml_vec_dot_iq2_xxs_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
2282+
break;
22822283
default:
2283-
return ggml_vec_dot_iq2_xxs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2284+
ggml_vec_dot_iq2_xxs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2285+
break;
22842286
}
22852287
#else
2286-
return ggml_vec_dot_iq2_xxs_q8_K(n, s, bs, vx, bx, vy, by, nrc);
2288+
ggml_vec_dot_iq2_xxs_q8_K(n, s, bs, vx, bx, vy, by, nrc);
22872289
#endif
22882290
}
22892291

2290-
void ggml_vec_dot_iq4_nl_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2292+
static void ggml_vec_dot_iq4_nl_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
22912293
assert(nrc == 1);
22922294
UNUSED(nrc);
22932295
UNUSED(bx);
@@ -2340,7 +2342,7 @@ void ggml_vec_dot_iq4_nl_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, c
23402342
*s = sumf;
23412343
}
23422344

2343-
void ggml_vec_dot_iq4_nl_q8_0_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2345+
static void ggml_vec_dot_iq4_nl_q8_0_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
23442346
assert(nrc == 1);
23452347
UNUSED(nrc);
23462348
UNUSED(bx);
@@ -2401,16 +2403,18 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
24012403
#if defined __riscv_v_intrinsic
24022404
switch (__riscv_vlenb() * 8) {
24032405
case 128:
2404-
return ggml_vec_dot_iq4_nl_q8_0_vl128(n, s, bs, vx, bx, vy, by, nrc);
2406+
ggml_vec_dot_iq4_nl_q8_0_vl128(n, s, bs, vx, bx, vy, by, nrc);
2407+
break;
24052408
default:
2406-
return ggml_vec_dot_iq4_nl_q8_0_vl256(n, s, bs, vx, bx, vy, by, nrc);
2409+
ggml_vec_dot_iq4_nl_q8_0_vl256(n, s, bs, vx, bx, vy, by, nrc);
2410+
break;
24072411
}
24082412
#else
2409-
return ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
2413+
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
24102414
#endif
24112415
}
24122416

2413-
void ggml_vec_dot_mxfp4_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2417+
static void ggml_vec_dot_mxfp4_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
24142418
assert(nrc == 1);
24152419
UNUSED(nrc);
24162420
UNUSED(bx);
@@ -2463,7 +2467,7 @@ void ggml_vec_dot_mxfp4_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, co
24632467
*s = sumf;
24642468
}
24652469

2466-
void ggml_vec_dot_mxfp4_q8_0_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2470+
static void ggml_vec_dot_mxfp4_q8_0_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
24672471
assert(nrc == 1);
24682472
UNUSED(nrc);
24692473
UNUSED(bx);
@@ -2524,16 +2528,18 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
25242528
#if defined __riscv_v_intrinsic
25252529
switch (__riscv_vlenb() * 8) {
25262530
case 128:
2527-
return ggml_vec_dot_mxfp4_q8_0_vl128(n, s, bs, vx, bx, vy, by, nrc);
2531+
ggml_vec_dot_mxfp4_q8_0_vl128(n, s, bs, vx, bx, vy, by, nrc);
2532+
break;
25282533
default:
2529-
return ggml_vec_dot_mxfp4_q8_0_vl256(n, s, bs, vx, bx, vy, by, nrc);
2534+
ggml_vec_dot_mxfp4_q8_0_vl256(n, s, bs, vx, bx, vy, by, nrc);
2535+
break;
25302536
}
25312537
#else
25322538
return ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
25332539
#endif
25342540
}
25352541

2536-
void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2542+
static void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
25372543
assert(nrc == 1);
25382544
UNUSED(nrc);
25392545
UNUSED(bx);
@@ -2621,11 +2627,211 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
26212627
#if defined __riscv_v_intrinsic
26222628
switch (__riscv_vlenb() * 8) {
26232629
case 256:
2624-
return ggml_vec_dot_iq4_xs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2630+
ggml_vec_dot_iq4_xs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2631+
break;
26252632
default:
2626-
return ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2633+
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2634+
break;
26272635
}
26282636
#else
2629-
return ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2637+
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2638+
#endif
2639+
}
2640+
2641+
static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2642+
2643+
assert(n % QK_K == 0);
2644+
assert(nrc == 1);
2645+
UNUSED(nrc);
2646+
UNUSED(bx);
2647+
UNUSED(by);
2648+
UNUSED(bs);
2649+
2650+
const block_iq3_xxs * GGML_RESTRICT x = vx;
2651+
const block_q8_K * GGML_RESTRICT y = vy;
2652+
const int nb = n / QK_K;
2653+
2654+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
2655+
const uint32_t * grid32 = (const uint32_t *)iq3xxs_grid;
2656+
2657+
// constants for unpacking logic
2658+
const uint32_t shifts_val[8] = {0, 7, 14, 21, 0, 7, 14, 21};
2659+
vuint32m1_t v_shifts = __riscv_vle32_v_u32m1(shifts_val, 8);
2660+
2661+
const uint32_t gather_idx_val[8] = {0, 0, 0, 0, 1, 1, 1, 1};
2662+
vuint32m1_t v_gather_idx = __riscv_vle32_v_u32m1(gather_idx_val, 8);
2663+
2664+
uint32_t aux32[2];
2665+
float sumf = 0.0f;
2666+
2667+
for (int i = 0; i < nb; ++i) {
2668+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2669+
2670+
const uint8_t * GGML_RESTRICT q3_indices = x[i].qs;
2671+
const uint8_t * GGML_RESTRICT metadata = x[i].qs + QK_K/4;
2672+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
2673+
2674+
float block_sum = 0.0f;
2675+
2676+
for (int ib = 0; ib < QK_K / 64; ++ib) {
2677+
// Load q8 (64 bytes)
2678+
vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
2679+
q8 += 64;
2680+
2681+
// load of metadata via memcpy
2682+
memcpy(aux32, metadata, 2 * sizeof(uint32_t));
2683+
metadata += 2 * sizeof(uint32_t);
2684+
2685+
// Load q3 indices and gather magnitudes
2686+
vuint8mf2_t v_q3_idx_u8 = __riscv_vle8_v_u8mf2(q3_indices, 16);
2687+
q3_indices += 16;
2688+
2689+
vuint16m1_t v_q3_idx_u16 = __riscv_vwmulu_vx_u16m1(v_q3_idx_u8, 4, 16);
2690+
vuint32m2_t v_q3_magnitudes_u32 = __riscv_vluxei16_v_u32m2(grid32, v_q3_idx_u16, 16);
2691+
vint8m2_t v_q3_magnitudes = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u32m2_u8m2(v_q3_magnitudes_u32));
2692+
2693+
// --- Unpacking of Sign Indices ---
2694+
2695+
// 1. Load the 2 auxiliary 32-bit integers into a vector
2696+
vuint32m1_t v_aux = __riscv_vle32_v_u32m1(aux32, 2);
2697+
2698+
// 2. Broadcast/Gather: replicate aux[0] to first 4 lanes, aux[1] to next 4 lanes
2699+
vuint32m1_t v_aux_expanded = __riscv_vrgather_vv_u32m1(v_aux, v_gather_idx, 8);
2700+
2701+
// 3. Apply Shifts and Mask: ((val >> shift) & 127)
2702+
vuint32m1_t v_s_vals_raw = __riscv_vand_vx_u32m1(__riscv_vsrl_vv_u32m1(v_aux_expanded, v_shifts, 8), 127, 8);
2703+
2704+
// 4. Narrow to u16 (required for vluxei index) and multiply by 8 (byte offset for u64 table)
2705+
vuint16mf2_t sign_indices_byte_offset = __riscv_vsll_vx_u16mf2(__riscv_vncvt_x_x_w_u16mf2(v_s_vals_raw, 8), 3, 8);
2706+
2707+
// 5. Gather Signs
2708+
vuint64m2_t v_s_vals_u64 = __riscv_vluxei16_v_u64m2(signs64, sign_indices_byte_offset, 8);
2709+
vint8m2_t v_s_vals = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(v_s_vals_u64));
2710+
2711+
vint8m2_t v_q3_signed = __riscv_vmul_vv_i8m2(v_q3_magnitudes, v_s_vals, 64);
2712+
vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_q8, v_q3_signed, 64);
2713+
2714+
vint16m2_t v_dot_1 = __riscv_vget_v_i16m4_i16m2(v_dot, 0);
2715+
vint16m2_t v_dot_2 = __riscv_vget_v_i16m4_i16m2(v_dot, 1);
2716+
2717+
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
2718+
vint32m1_t v_sum_1 = __riscv_vwredsum_vs_i16m2_i32m1(v_dot_1, v_zero, 32);
2719+
vint32m1_t v_sum_2 = __riscv_vwredsum_vs_i16m2_i32m1(v_dot_2, v_zero, 32);
2720+
2721+
int32_t sum1_i = __riscv_vmv_x_s_i32m1_i32(v_sum_1);
2722+
int32_t sum2_i = __riscv_vmv_x_s_i32m1_i32(v_sum_2);
2723+
2724+
const float scale1_f = (float)(2 * (aux32[0] >> 28) + 1);
2725+
const float scale2_f = (float)(2 * (aux32[1] >> 28) + 1);
2726+
2727+
block_sum += sum1_i * scale1_f + sum2_i * scale2_f;
2728+
}
2729+
2730+
sumf += d * block_sum;
2731+
}
2732+
*s = 0.25f * sumf;
2733+
}
2734+
2735+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2736+
#if defined __riscv_v_intrinsic
2737+
switch (__riscv_vlenb() * 8) {
2738+
case 128:
2739+
return ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2740+
default:
2741+
return ggml_vec_dot_iq3_xxs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2742+
}
2743+
#endif
2744+
return ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2745+
}
2746+
2747+
static void ggml_vec_dot_iq2_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2748+
assert(n % QK_K == 0);
2749+
assert(nrc == 1);
2750+
UNUSED(nrc);
2751+
UNUSED(bx);
2752+
UNUSED(by);
2753+
UNUSED(bs);
2754+
2755+
const block_iq2_xs * GGML_RESTRICT x = vx;
2756+
const block_q8_K * GGML_RESTRICT y = vy;
2757+
2758+
const int nb = n / QK_K;
2759+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
2760+
const uint64_t * grid64 = (const uint64_t *)iq2xs_grid;
2761+
2762+
float sumf = 0.0f;
2763+
2764+
for (int i = 0; i < nb; ++i) {
2765+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2766+
const uint16_t * GGML_RESTRICT qs = x[i].qs;
2767+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
2768+
const uint8_t * GGML_RESTRICT scales = x[i].scales;
2769+
2770+
int32_t sum_int = 0;
2771+
2772+
// Loop over 4 subblocks of 64 elements (QK_K = 256)
2773+
for (int ib64 = 0; ib64 < QK_K / 64; ++ib64) {
2774+
// Load 8 uint16 indices (controls 64 values)
2775+
vuint16mf2_t v_qs = __riscv_vle16_v_u16mf2(qs, 8);
2776+
qs += 8;
2777+
2778+
// Extract indices for grid (low 9 bits) and signs (high 7 bits)
2779+
// Multiply by 8 (<< 3) for byte offsets into the uint64 tables
2780+
vuint16mf2_t vidx_grid = __riscv_vsll_vx_u16mf2(__riscv_vand_vx_u16mf2(v_qs, 511, 8), 3, 8);
2781+
vuint16mf2_t vidx_sign = __riscv_vsll_vx_u16mf2(__riscv_vsrl_vx_u16mf2(v_qs, 9, 8), 3, 8);
2782+
2783+
vuint64m2_t vq2_64 = __riscv_vluxei16_v_u64m2(grid64, vidx_grid, 8);
2784+
vuint64m2_t vs2_64 = __riscv_vluxei16_v_u64m2(signs64, vidx_sign, 8);
2785+
2786+
vint8m2_t q2u = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(vq2_64));
2787+
vint8m2_t q2s = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(vs2_64));
2788+
2789+
// Apply signs
2790+
vint8m2_t q2_final = __riscv_vmul_vv_i8m2(q2u, q2s, 64);
2791+
2792+
// Load Q8 weights (64 elements)
2793+
vint8m2_t q8v = __riscv_vle8_v_i8m2(q8, 64);
2794+
q8 += 64;
2795+
2796+
// Multiply (Widening to int16, 64 elements -> LMUL=4)
2797+
vint16m4_t prod = __riscv_vwmul_vv_i16m4(q2_final, q8v, 64);
2798+
2799+
// Reduction
2800+
vint32m1_t zero_vec = __riscv_vmv_v_x_i32m1(0, 1);
2801+
2802+
int32_t sum0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
2803+
__riscv_vget_v_i16m4_i16m1(prod, 0), zero_vec, 16));
2804+
int32_t sum1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
2805+
__riscv_vget_v_i16m4_i16m1(prod, 1), zero_vec, 16));
2806+
int32_t sum2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
2807+
__riscv_vget_v_i16m4_i16m1(prod, 2), zero_vec, 16));
2808+
int32_t sum3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
2809+
__riscv_vget_v_i16m4_i16m1(prod, 3), zero_vec, 16));
2810+
2811+
// Apply Scales
2812+
const uint8_t scale_byte_1 = scales[0];
2813+
const uint8_t scale_byte_2 = scales[1];
2814+
scales += 2;
2815+
2816+
sum_int += sum0 * ((scale_byte_1 & 0x0F) * 2 + 1);
2817+
sum_int += sum1 * ((scale_byte_1 >> 4) * 2 + 1);
2818+
sum_int += sum2 * ((scale_byte_2 & 0x0F) * 2 + 1);
2819+
sum_int += sum3 * ((scale_byte_2 >> 4) * 2 + 1);
2820+
}
2821+
2822+
sumf += d * sum_int;
2823+
}
2824+
*s = 0.125f * sumf;
2825+
}
2826+
2827+
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2828+
#if defined __riscv_v_intrinsic
2829+
switch (__riscv_vlenb() * 8) {
2830+
case 128:
2831+
return ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2832+
default:
2833+
return ggml_vec_dot_iq2_xs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2834+
}
26302835
#endif
2836+
return ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
26312837
}

0 commit comments

Comments
 (0)