Skip to content

Commit e56f607

Browse files
ggml-cpu: add rvv ggml_quantize_mat_4x8 for q8_0
Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
1 parent cd78e57 commit e56f607

1 file changed

Lines changed: 88 additions & 0 deletions

File tree

ggml/src/ggml-cpu/arch/riscv/repack.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,94 @@
2424

2525
#define UNUSED GGML_UNUSED
2626

27+
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
28+
assert(QK8_0 == 32);
29+
assert(k % QK8_0 == 0);
30+
const int nb = k / QK8_0;
31+
32+
#if defined(__riscv_v_intrinsic)
33+
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
34+
const size_t vl_calc = __riscv_vsetvl_e32m8(QK8_0);
35+
const size_t vl_save = __riscv_vsetvl_e64m2(4);
36+
vfloat32m1_t v_scalar_zero = __riscv_vfmv_s_f_f32m1(0.0f, __riscv_vsetvl_e32m1(1));
37+
38+
for (int i = 0; i < nb; i++) {
39+
const float *x_block_base = x + i * QK8_0;
40+
vint8m2_t q_r0, q_r1, q_r2, q_r3;
41+
{
42+
vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 0 * k, vl_calc);
43+
vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
44+
vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
45+
float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
46+
47+
float d = amax / 127.0f;
48+
y[i].d[0] = GGML_CPU_FP32_TO_FP16(d);
49+
50+
float id = d ? 1.0f / d : 0.0f;
51+
vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
52+
vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl_calc);
53+
q_r0 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
54+
}
55+
asm volatile ("" ::: "memory");
56+
57+
{
58+
vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 1 * k, vl_calc);
59+
vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
60+
vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
61+
float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
62+
63+
float d = amax / 127.0f;
64+
y[i].d[1] = GGML_CPU_FP32_TO_FP16(d);
65+
float id = d ? 1.0f / d : 0.0f;
66+
67+
vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
68+
vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl_calc);
69+
q_r1 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
70+
}
71+
asm volatile ("" ::: "memory");
72+
{
73+
vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 2 * k, vl_calc);
74+
vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
75+
vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
76+
float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
77+
78+
float d = amax / 127.0f;
79+
y[i].d[2] = GGML_CPU_FP32_TO_FP16(d);
80+
float id = d ? 1.0f / d : 0.0f;
81+
82+
vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
83+
vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl_calc);
84+
q_r2 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
85+
}
86+
asm volatile ("" ::: "memory");
87+
{
88+
vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 3 * k, vl_calc);
89+
vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
90+
vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
91+
float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
92+
93+
float d = amax / 127.0f;
94+
y[i].d[3] = GGML_CPU_FP32_TO_FP16(d);
95+
float id = d ? 1.0f / d : 0.0f;
96+
97+
vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
98+
vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl_calc);
99+
q_r3 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
100+
}
101+
vint64m2_t v_q64_r0 = __riscv_vreinterpret_v_i8m2_i64m2(q_r0);
102+
vint64m2_t v_q64_r1 = __riscv_vreinterpret_v_i8m2_i64m2(q_r1);
103+
vint64m2_t v_q64_r2 = __riscv_vreinterpret_v_i8m2_i64m2(q_r2);
104+
vint64m2_t v_q64_r3 = __riscv_vreinterpret_v_i8m2_i64m2(q_r3);
105+
vint64m2x4_t v_quant_tuple = __riscv_vcreate_v_i64m2x4(v_q64_r0, v_q64_r1, v_q64_r2, v_q64_r3);
106+
__riscv_vsseg4e64_v_i64m2x4((int64_t*)y[i].qs, v_quant_tuple, vl_save);
107+
}
108+
#else
109+
UNUSED(nb);
110+
UNUSED(y);
111+
ggml_quantize_mat_q8_0_4x4_generic(x, vy, k);
112+
#endif
113+
}
114+
27115
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
28116
const int qk = QK8_0;
29117
const int nb = n / qk;

0 commit comments

Comments
 (0)