@@ -2089,7 +2089,7 @@ static const int8_t keven_signs_q2xs[1024] = {
20892089};
20902090#endif
20912091
2092- void ggml_vec_dot_iq2_xxs_q8_K_vl128 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2092+ static void ggml_vec_dot_iq2_xxs_q8_K_vl128 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
20932093 assert (n % QK_K == 0 );
20942094 assert (nrc == 1 );
20952095 UNUSED (nrc );
@@ -2116,7 +2116,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs,
21162116
21172117 float sum = 0.0f ;
21182118
2119- #pragma GCC nounroll
2119+ #pragma GCC unroll 1
21202120 for (int ib32 = 0 ; ib32 < QK_K / 32 ; ib32 += 2 ) {
21212121 vint8m2_t q8_1 = __riscv_vle8_v_i8m2 (q8 , 32 ); q8 += 32 ;
21222122 vint8m2_t q8_2 = __riscv_vle8_v_i8m2 (q8 , 32 ); q8 += 32 ;
@@ -2180,7 +2180,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs,
21802180 * s = 0.125f * sumf ;
21812181}
21822182
2183- void ggml_vec_dot_iq2_xxs_q8_K_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2183+ static void ggml_vec_dot_iq2_xxs_q8_K_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
21842184 assert (n % QK_K == 0 );
21852185 assert (nrc == 1 );
21862186 UNUSED (nrc );
@@ -2278,16 +2278,18 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
22782278#if defined __riscv_v_intrinsic
22792279 switch (__riscv_vlenb () * 8 ) {
22802280 case 128 :
2281- return ggml_vec_dot_iq2_xxs_q8_K_vl128 (n , s , bs , vx , bx , vy , by , nrc );
2281+ ggml_vec_dot_iq2_xxs_q8_K_vl128 (n , s , bs , vx , bx , vy , by , nrc );
2282+ break ;
22822283 default :
2283- return ggml_vec_dot_iq2_xxs_q8_K_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2284+ ggml_vec_dot_iq2_xxs_q8_K_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2285+ break ;
22842286 }
22852287#else
2286- return ggml_vec_dot_iq2_xxs_q8_K (n , s , bs , vx , bx , vy , by , nrc );
2288+ ggml_vec_dot_iq2_xxs_q8_K (n , s , bs , vx , bx , vy , by , nrc );
22872289#endif
22882290}
22892291
2290- void ggml_vec_dot_iq4_nl_q8_0_vl128 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2292+ static void ggml_vec_dot_iq4_nl_q8_0_vl128 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
22912293 assert (nrc == 1 );
22922294 UNUSED (nrc );
22932295 UNUSED (bx );
@@ -2340,7 +2342,7 @@ void ggml_vec_dot_iq4_nl_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, c
23402342 * s = sumf ;
23412343}
23422344
2343- void ggml_vec_dot_iq4_nl_q8_0_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2345+ static void ggml_vec_dot_iq4_nl_q8_0_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
23442346 assert (nrc == 1 );
23452347 UNUSED (nrc );
23462348 UNUSED (bx );
@@ -2401,16 +2403,18 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
24012403#if defined __riscv_v_intrinsic
24022404 switch (__riscv_vlenb () * 8 ) {
24032405 case 128 :
2404- return ggml_vec_dot_iq4_nl_q8_0_vl128 (n , s , bs , vx , bx , vy , by , nrc );
2406+ ggml_vec_dot_iq4_nl_q8_0_vl128 (n , s , bs , vx , bx , vy , by , nrc );
2407+ break ;
24052408 default :
2406- return ggml_vec_dot_iq4_nl_q8_0_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2409+ ggml_vec_dot_iq4_nl_q8_0_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2410+ break ;
24072411 }
24082412#else
2409- return ggml_vec_dot_iq4_nl_q8_0_generic (n , s , bs , vx , bx , vy , by , nrc );
2413+ ggml_vec_dot_iq4_nl_q8_0_generic (n , s , bs , vx , bx , vy , by , nrc );
24102414#endif
24112415}
24122416
2413- void ggml_vec_dot_mxfp4_q8_0_vl128 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2417+ static void ggml_vec_dot_mxfp4_q8_0_vl128 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
24142418 assert (nrc == 1 );
24152419 UNUSED (nrc );
24162420 UNUSED (bx );
@@ -2463,7 +2467,7 @@ void ggml_vec_dot_mxfp4_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, co
24632467 * s = sumf ;
24642468}
24652469
2466- void ggml_vec_dot_mxfp4_q8_0_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2470+ static void ggml_vec_dot_mxfp4_q8_0_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
24672471 assert (nrc == 1 );
24682472 UNUSED (nrc );
24692473 UNUSED (bx );
@@ -2524,16 +2528,18 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
25242528#if defined __riscv_v_intrinsic
25252529 switch (__riscv_vlenb () * 8 ) {
25262530 case 128 :
2527- return ggml_vec_dot_mxfp4_q8_0_vl128 (n , s , bs , vx , bx , vy , by , nrc );
2531+ ggml_vec_dot_mxfp4_q8_0_vl128 (n , s , bs , vx , bx , vy , by , nrc );
2532+ break ;
25282533 default :
2529- return ggml_vec_dot_mxfp4_q8_0_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2534+ ggml_vec_dot_mxfp4_q8_0_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2535+ break ;
25302536 }
25312537#else
25322538 return ggml_vec_dot_mxfp4_q8_0_generic (n , s , bs , vx , bx , vy , by , nrc );
25332539#endif
25342540}
25352541
2536- void ggml_vec_dot_iq4_xs_q8_K_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2542+ static void ggml_vec_dot_iq4_xs_q8_K_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
25372543 assert (nrc == 1 );
25382544 UNUSED (nrc );
25392545 UNUSED (bx );
@@ -2621,11 +2627,211 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
26212627#if defined __riscv_v_intrinsic
26222628 switch (__riscv_vlenb () * 8 ) {
26232629 case 256 :
2624- return ggml_vec_dot_iq4_xs_q8_K_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2630+ ggml_vec_dot_iq4_xs_q8_K_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2631+ break ;
26252632 default :
2626- return ggml_vec_dot_iq4_xs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
2633+ ggml_vec_dot_iq4_xs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
2634+ break ;
26272635 }
26282636#else
2629- return ggml_vec_dot_iq4_xs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
2637+ ggml_vec_dot_iq4_xs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
2638+ #endif
2639+ }
2640+
2641+ static void ggml_vec_dot_iq3_xxs_q8_K_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2642+
2643+ assert (n % QK_K == 0 );
2644+ assert (nrc == 1 );
2645+ UNUSED (nrc );
2646+ UNUSED (bx );
2647+ UNUSED (by );
2648+ UNUSED (bs );
2649+
2650+ const block_iq3_xxs * GGML_RESTRICT x = vx ;
2651+ const block_q8_K * GGML_RESTRICT y = vy ;
2652+ const int nb = n / QK_K ;
2653+
2654+ const uint64_t * signs64 = (const uint64_t * )keven_signs_q2xs ;
2655+ const uint32_t * grid32 = (const uint32_t * )iq3xxs_grid ;
2656+
2657+ // constants for unpacking logic
2658+ const uint32_t shifts_val [8 ] = {0 , 7 , 14 , 21 , 0 , 7 , 14 , 21 };
2659+ vuint32m1_t v_shifts = __riscv_vle32_v_u32m1 (shifts_val , 8 );
2660+
2661+ const uint32_t gather_idx_val [8 ] = {0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 };
2662+ vuint32m1_t v_gather_idx = __riscv_vle32_v_u32m1 (gather_idx_val , 8 );
2663+
2664+ uint32_t aux32 [2 ];
2665+ float sumf = 0.0f ;
2666+
2667+ for (int i = 0 ; i < nb ; ++ i ) {
2668+ const float d = GGML_CPU_FP16_TO_FP32 (x [i ].d ) * y [i ].d ;
2669+
2670+ const uint8_t * GGML_RESTRICT q3_indices = x [i ].qs ;
2671+ const uint8_t * GGML_RESTRICT metadata = x [i ].qs + QK_K /4 ;
2672+ const int8_t * GGML_RESTRICT q8 = y [i ].qs ;
2673+
2674+ float block_sum = 0.0f ;
2675+
2676+ for (int ib = 0 ; ib < QK_K / 64 ; ++ ib ) {
2677+ // Load q8 (64 bytes)
2678+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2 (q8 , 64 );
2679+ q8 += 64 ;
2680+
2681+ // load of metadata via memcpy
2682+ memcpy (aux32 , metadata , 2 * sizeof (uint32_t ));
2683+ metadata += 2 * sizeof (uint32_t );
2684+
2685+ // Load q3 indices and gather magnitudes
2686+ vuint8mf2_t v_q3_idx_u8 = __riscv_vle8_v_u8mf2 (q3_indices , 16 );
2687+ q3_indices += 16 ;
2688+
2689+ vuint16m1_t v_q3_idx_u16 = __riscv_vwmulu_vx_u16m1 (v_q3_idx_u8 , 4 , 16 );
2690+ vuint32m2_t v_q3_magnitudes_u32 = __riscv_vluxei16_v_u32m2 (grid32 , v_q3_idx_u16 , 16 );
2691+ vint8m2_t v_q3_magnitudes = __riscv_vreinterpret_v_u8m2_i8m2 (__riscv_vreinterpret_v_u32m2_u8m2 (v_q3_magnitudes_u32 ));
2692+
2693+ // --- Unpacking of Sign Indices ---
2694+
2695+ // 1. Load the 2 auxiliary 32-bit integers into a vector
2696+ vuint32m1_t v_aux = __riscv_vle32_v_u32m1 (aux32 , 2 );
2697+
2698+ // 2. Broadcast/Gather: replicate aux[0] to first 4 lanes, aux[1] to next 4 lanes
2699+ vuint32m1_t v_aux_expanded = __riscv_vrgather_vv_u32m1 (v_aux , v_gather_idx , 8 );
2700+
2701+ // 3. Apply Shifts and Mask: ((val >> shift) & 127)
2702+ vuint32m1_t v_s_vals_raw = __riscv_vand_vx_u32m1 (__riscv_vsrl_vv_u32m1 (v_aux_expanded , v_shifts , 8 ), 127 , 8 );
2703+
2704+ // 4. Narrow to u16 (required for vluxei index) and multiply by 8 (byte offset for u64 table)
2705+ vuint16mf2_t sign_indices_byte_offset = __riscv_vsll_vx_u16mf2 (__riscv_vncvt_x_x_w_u16mf2 (v_s_vals_raw , 8 ), 3 , 8 );
2706+
2707+ // 5. Gather Signs
2708+ vuint64m2_t v_s_vals_u64 = __riscv_vluxei16_v_u64m2 (signs64 , sign_indices_byte_offset , 8 );
2709+ vint8m2_t v_s_vals = __riscv_vreinterpret_v_u8m2_i8m2 (__riscv_vreinterpret_v_u64m2_u8m2 (v_s_vals_u64 ));
2710+
2711+ vint8m2_t v_q3_signed = __riscv_vmul_vv_i8m2 (v_q3_magnitudes , v_s_vals , 64 );
2712+ vint16m4_t v_dot = __riscv_vwmul_vv_i16m4 (v_q8 , v_q3_signed , 64 );
2713+
2714+ vint16m2_t v_dot_1 = __riscv_vget_v_i16m4_i16m2 (v_dot , 0 );
2715+ vint16m2_t v_dot_2 = __riscv_vget_v_i16m4_i16m2 (v_dot , 1 );
2716+
2717+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1 (0 , 1 );
2718+ vint32m1_t v_sum_1 = __riscv_vwredsum_vs_i16m2_i32m1 (v_dot_1 , v_zero , 32 );
2719+ vint32m1_t v_sum_2 = __riscv_vwredsum_vs_i16m2_i32m1 (v_dot_2 , v_zero , 32 );
2720+
2721+ int32_t sum1_i = __riscv_vmv_x_s_i32m1_i32 (v_sum_1 );
2722+ int32_t sum2_i = __riscv_vmv_x_s_i32m1_i32 (v_sum_2 );
2723+
2724+ const float scale1_f = (float )(2 * (aux32 [0 ] >> 28 ) + 1 );
2725+ const float scale2_f = (float )(2 * (aux32 [1 ] >> 28 ) + 1 );
2726+
2727+ block_sum += sum1_i * scale1_f + sum2_i * scale2_f ;
2728+ }
2729+
2730+ sumf += d * block_sum ;
2731+ }
2732+ * s = 0.25f * sumf ;
2733+ }
2734+
2735+ void ggml_vec_dot_iq3_xxs_q8_K (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2736+ #if defined __riscv_v_intrinsic
2737+ switch (__riscv_vlenb () * 8 ) {
2738+ case 128 :
2739+ return ggml_vec_dot_iq3_xxs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
2740+ default :
2741+ return ggml_vec_dot_iq3_xxs_q8_K_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2742+ }
2743+ #endif
2744+ return ggml_vec_dot_iq3_xxs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
2745+ }
2746+
2747+ static void ggml_vec_dot_iq2_xs_q8_K_vl256 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2748+ assert (n % QK_K == 0 );
2749+ assert (nrc == 1 );
2750+ UNUSED (nrc );
2751+ UNUSED (bx );
2752+ UNUSED (by );
2753+ UNUSED (bs );
2754+
2755+ const block_iq2_xs * GGML_RESTRICT x = vx ;
2756+ const block_q8_K * GGML_RESTRICT y = vy ;
2757+
2758+ const int nb = n / QK_K ;
2759+ const uint64_t * signs64 = (const uint64_t * )keven_signs_q2xs ;
2760+ const uint64_t * grid64 = (const uint64_t * )iq2xs_grid ;
2761+
2762+ float sumf = 0.0f ;
2763+
2764+ for (int i = 0 ; i < nb ; ++ i ) {
2765+ const float d = GGML_CPU_FP16_TO_FP32 (x [i ].d ) * y [i ].d ;
2766+ const uint16_t * GGML_RESTRICT qs = x [i ].qs ;
2767+ const int8_t * GGML_RESTRICT q8 = y [i ].qs ;
2768+ const uint8_t * GGML_RESTRICT scales = x [i ].scales ;
2769+
2770+ int32_t sum_int = 0 ;
2771+
2772+ // Loop over 4 subblocks of 64 elements (QK_K = 256)
2773+ for (int ib64 = 0 ; ib64 < QK_K / 64 ; ++ ib64 ) {
2774+ // Load 8 uint16 indices (controls 64 values)
2775+ vuint16mf2_t v_qs = __riscv_vle16_v_u16mf2 (qs , 8 );
2776+ qs += 8 ;
2777+
2778+ // Extract indices for grid (low 9 bits) and signs (high 7 bits)
2779+ // Multiply by 8 (<< 3) for byte offsets into the uint64 tables
2780+ vuint16mf2_t vidx_grid = __riscv_vsll_vx_u16mf2 (__riscv_vand_vx_u16mf2 (v_qs , 511 , 8 ), 3 , 8 );
2781+ vuint16mf2_t vidx_sign = __riscv_vsll_vx_u16mf2 (__riscv_vsrl_vx_u16mf2 (v_qs , 9 , 8 ), 3 , 8 );
2782+
2783+ vuint64m2_t vq2_64 = __riscv_vluxei16_v_u64m2 (grid64 , vidx_grid , 8 );
2784+ vuint64m2_t vs2_64 = __riscv_vluxei16_v_u64m2 (signs64 , vidx_sign , 8 );
2785+
2786+ vint8m2_t q2u = __riscv_vreinterpret_v_u8m2_i8m2 (__riscv_vreinterpret_v_u64m2_u8m2 (vq2_64 ));
2787+ vint8m2_t q2s = __riscv_vreinterpret_v_u8m2_i8m2 (__riscv_vreinterpret_v_u64m2_u8m2 (vs2_64 ));
2788+
2789+ // Apply signs
2790+ vint8m2_t q2_final = __riscv_vmul_vv_i8m2 (q2u , q2s , 64 );
2791+
2792+ // Load Q8 weights (64 elements)
2793+ vint8m2_t q8v = __riscv_vle8_v_i8m2 (q8 , 64 );
2794+ q8 += 64 ;
2795+
2796+ // Multiply (Widening to int16, 64 elements -> LMUL=4)
2797+ vint16m4_t prod = __riscv_vwmul_vv_i16m4 (q2_final , q8v , 64 );
2798+
2799+ // Reduction
2800+ vint32m1_t zero_vec = __riscv_vmv_v_x_i32m1 (0 , 1 );
2801+
2802+ int32_t sum0 = __riscv_vmv_x_s_i32m1_i32 (__riscv_vwredsum_vs_i16m1_i32m1 (
2803+ __riscv_vget_v_i16m4_i16m1 (prod , 0 ), zero_vec , 16 ));
2804+ int32_t sum1 = __riscv_vmv_x_s_i32m1_i32 (__riscv_vwredsum_vs_i16m1_i32m1 (
2805+ __riscv_vget_v_i16m4_i16m1 (prod , 1 ), zero_vec , 16 ));
2806+ int32_t sum2 = __riscv_vmv_x_s_i32m1_i32 (__riscv_vwredsum_vs_i16m1_i32m1 (
2807+ __riscv_vget_v_i16m4_i16m1 (prod , 2 ), zero_vec , 16 ));
2808+ int32_t sum3 = __riscv_vmv_x_s_i32m1_i32 (__riscv_vwredsum_vs_i16m1_i32m1 (
2809+ __riscv_vget_v_i16m4_i16m1 (prod , 3 ), zero_vec , 16 ));
2810+
2811+ // Apply Scales
2812+ const uint8_t scale_byte_1 = scales [0 ];
2813+ const uint8_t scale_byte_2 = scales [1 ];
2814+ scales += 2 ;
2815+
2816+ sum_int += sum0 * ((scale_byte_1 & 0x0F ) * 2 + 1 );
2817+ sum_int += sum1 * ((scale_byte_1 >> 4 ) * 2 + 1 );
2818+ sum_int += sum2 * ((scale_byte_2 & 0x0F ) * 2 + 1 );
2819+ sum_int += sum3 * ((scale_byte_2 >> 4 ) * 2 + 1 );
2820+ }
2821+
2822+ sumf += d * sum_int ;
2823+ }
2824+ * s = 0.125f * sumf ;
2825+ }
2826+
2827+ void ggml_vec_dot_iq2_xs_q8_K (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
2828+ #if defined __riscv_v_intrinsic
2829+ switch (__riscv_vlenb () * 8 ) {
2830+ case 128 :
2831+ return ggml_vec_dot_iq2_xs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
2832+ default :
2833+ return ggml_vec_dot_iq2_xs_q8_K_vl256 (n , s , bs , vx , bx , vy , by , nrc );
2834+ }
26302835#endif
2836+ return ggml_vec_dot_iq2_xs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
26312837}
0 commit comments