|
24 | 24 |
|
25 | 25 | #define UNUSED GGML_UNUSED |
26 | 26 |
|
| 27 | +void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
| 28 | + assert(QK8_0 == 32); |
| 29 | + assert(k % QK8_0 == 0); |
| 30 | + const int nb = k / QK8_0; |
| 31 | + |
| 32 | +#if defined(__riscv_v_intrinsic) |
| 33 | + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; |
| 34 | + const size_t vl_calc = __riscv_vsetvl_e32m8(QK8_0); |
| 35 | + const size_t vl_save = __riscv_vsetvl_e64m2(4); |
| 36 | + vfloat32m1_t v_scalar_zero = __riscv_vfmv_s_f_f32m1(0.0f, __riscv_vsetvl_e32m1(1)); |
| 37 | + |
| 38 | + for (int i = 0; i < nb; i++) { |
| 39 | + const float *x_block_base = x + i * QK8_0; |
| 40 | + vint8m2_t q_r0, q_r1, q_r2, q_r3; |
| 41 | + { |
| 42 | + vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 0 * k, vl_calc); |
| 43 | + vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc); |
| 44 | + vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc); |
| 45 | + float amax = __riscv_vfmv_f_s_f32m1_f32(v_max); |
| 46 | + |
| 47 | + float d = amax / 127.0f; |
| 48 | + y[i].d[0] = GGML_CPU_FP32_TO_FP16(d); |
| 49 | + |
| 50 | + float id = d ? 1.0f / d : 0.0f; |
| 51 | + vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc); |
| 52 | + vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl_calc); |
| 53 | + q_r0 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc); |
| 54 | + } |
| 55 | + asm volatile ("" ::: "memory"); |
| 56 | + |
| 57 | + { |
| 58 | + vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 1 * k, vl_calc); |
| 59 | + vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc); |
| 60 | + vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc); |
| 61 | + float amax = __riscv_vfmv_f_s_f32m1_f32(v_max); |
| 62 | + |
| 63 | + float d = amax / 127.0f; |
| 64 | + y[i].d[1] = GGML_CPU_FP32_TO_FP16(d); |
| 65 | + float id = d ? 1.0f / d : 0.0f; |
| 66 | + |
| 67 | + vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc); |
| 68 | + vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl_calc); |
| 69 | + q_r1 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc); |
| 70 | + } |
| 71 | + asm volatile ("" ::: "memory"); |
| 72 | + { |
| 73 | + vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 2 * k, vl_calc); |
| 74 | + vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc); |
| 75 | + vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc); |
| 76 | + float amax = __riscv_vfmv_f_s_f32m1_f32(v_max); |
| 77 | + |
| 78 | + float d = amax / 127.0f; |
| 79 | + y[i].d[2] = GGML_CPU_FP32_TO_FP16(d); |
| 80 | + float id = d ? 1.0f / d : 0.0f; |
| 81 | + |
| 82 | + vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc); |
| 83 | + vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl_calc); |
| 84 | + q_r2 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc); |
| 85 | + } |
| 86 | + asm volatile ("" ::: "memory"); |
| 87 | + { |
| 88 | + vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 3 * k, vl_calc); |
| 89 | + vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc); |
| 90 | + vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc); |
| 91 | + float amax = __riscv_vfmv_f_s_f32m1_f32(v_max); |
| 92 | + |
| 93 | + float d = amax / 127.0f; |
| 94 | + y[i].d[3] = GGML_CPU_FP32_TO_FP16(d); |
| 95 | + float id = d ? 1.0f / d : 0.0f; |
| 96 | + |
| 97 | + vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc); |
| 98 | + vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl_calc); |
| 99 | + q_r3 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc); |
| 100 | + } |
| 101 | + vint64m2_t v_q64_r0 = __riscv_vreinterpret_v_i8m2_i64m2(q_r0); |
| 102 | + vint64m2_t v_q64_r1 = __riscv_vreinterpret_v_i8m2_i64m2(q_r1); |
| 103 | + vint64m2_t v_q64_r2 = __riscv_vreinterpret_v_i8m2_i64m2(q_r2); |
| 104 | + vint64m2_t v_q64_r3 = __riscv_vreinterpret_v_i8m2_i64m2(q_r3); |
| 105 | + vint64m2x4_t v_quant_tuple = __riscv_vcreate_v_i64m2x4(v_q64_r0, v_q64_r1, v_q64_r2, v_q64_r3); |
| 106 | + __riscv_vsseg4e64_v_i64m2x4((int64_t*)y[i].qs, v_quant_tuple, vl_save); |
| 107 | + } |
| 108 | +#else |
| 109 | + UNUSED(nb); |
| 110 | + UNUSED(y); |
| 111 | + ggml_quantize_mat_q8_0_4x4_generic(x, vy, k); |
| 112 | +#endif |
| 113 | +} |
| 114 | + |
27 | 115 | void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
28 | 116 | const int qk = QK8_0; |
29 | 117 | const int nb = n / qk; |
|
0 commit comments