Skip to content

Commit 91cf5ad

Browse files
committed
ggml-cpu: add repack GEMM and GEMV for floating-point
1 parent 1191758 commit 91cf5ad

4 files changed

Lines changed: 883 additions & 50 deletions

File tree

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
3939
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
4040
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
41+
#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1
42+
#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1
4143
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
4244
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
4345
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
@@ -77,15 +79,33 @@
7779
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
7880
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
7981
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
82+
#define ggml_gemv_f16_1x16_f16_generic ggml_gemv_f16_1x16_f16
83+
#define ggml_gemv_f16_1x32_f16_generic ggml_gemv_f16_1x32_f16
84+
#define ggml_gemv_f16_1x64_f16_generic ggml_gemv_f16_1x64_f16
85+
#define ggml_gemv_f16_1x128_f16_generic ggml_gemv_f16_1x128_f16
86+
#define ggml_gemv_f32_1x16_f32_generic ggml_gemv_f32_1x16_f32
87+
#define ggml_gemv_f32_1x32_f32_generic ggml_gemv_f32_1x32_f32
88+
#define ggml_gemv_f32_1x64_f32_generic ggml_gemv_f32_1x64_f32
89+
#define ggml_gemv_f32_1x128_f32_generic ggml_gemv_f32_1x128_f32
8090
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
8191
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
8292
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
93+
#define ggml_gemm_f16_7x1x16_f16_generic ggml_gemm_f16_7x1x16_f16
94+
#define ggml_gemm_f16_7x1x32_f16_generic ggml_gemm_f16_7x1x32_f16
95+
#define ggml_gemm_f16_7x1x64_f16_generic ggml_gemm_f16_7x1x64_f16
96+
#define ggml_gemm_f16_7x1x128_f16_generic ggml_gemm_f16_7x1x128_f16
97+
#define ggml_gemm_f32_7x1x16_f32_generic ggml_gemm_f32_7x1x16_f32
98+
#define ggml_gemm_f32_7x1x32_f32_generic ggml_gemm_f32_7x1x32_f32
99+
#define ggml_gemm_f32_7x1x64_f32_generic ggml_gemm_f32_7x1x64_f32
100+
#define ggml_gemm_f32_7x1x128_f32_generic ggml_gemm_f32_7x1x128_f32
83101
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
84102
// quants.c
85103
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
86104
// repack.cpp
87105
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
88106
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
107+
#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1
108+
#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1
89109
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
90110
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
91111
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
@@ -122,6 +142,8 @@
122142
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
123143
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
124144
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
145+
#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1
146+
#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1
125147
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
126148
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
127149
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
@@ -168,6 +190,8 @@
168190
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
169191
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
170192
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
193+
#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1
194+
#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1
171195
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
172196
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
173197
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
@@ -209,6 +233,8 @@
209233
#define ggml_quantize_mat_q8_K_4x1_generic ggml_quantize_mat_q8_K_4x1
210234
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
211235
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
236+
#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1
237+
#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1
212238
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
213239
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
214240
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
@@ -259,6 +285,8 @@
259285
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
260286
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
261287
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
288+
#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1
289+
#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1
262290
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
263291
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
264292
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
@@ -312,6 +340,8 @@
312340
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
313341
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
314342
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
343+
#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1
344+
#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1
315345
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
316346
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
317347
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0

ggml/src/ggml-cpu/arch/riscv/repack.cpp

Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1701,3 +1701,277 @@ void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
17011701
}
17021702
}
17031703
#endif
1704+
1705+
template<int ncols_interleaved>
1706+
static inline void ggml_gemv_f16_1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1707+
const int nb = n / 1;
1708+
1709+
assert (nr == 1);
1710+
assert(n % 1 == 0);
1711+
assert(nc % ncols_interleaved == 0);
1712+
1713+
const _Float16 * a_ptr = (const _Float16 *) vy;
1714+
for (int x = 0; x < nc / ncols_interleaved; x++) {
1715+
const block_f16<ncols_interleaved, 1> * b_ptr = (const block_f16<ncols_interleaved, 1> *) vx + (x * nb);
1716+
1717+
// Accumulators
1718+
vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1719+
1720+
for (int l = 0; l < nb; l++) {
1721+
vfloat16m2_t b_0 = __riscv_vle16_v_f16m2((const _Float16 *)&b_ptr[l].d[0], ncols_interleaved);
1722+
1723+
sumf_0 = __riscv_vfwmacc_vf_f32m4(sumf_0, *(const _Float16*)(&a_ptr[l]), b_0, ncols_interleaved);
1724+
}
1725+
1726+
__riscv_vse32_v_f32m4(&s[x * ncols_interleaved], sumf_0, ncols_interleaved);
1727+
}
1728+
1729+
return;
1730+
}
1731+
1732+
void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1733+
#if defined __riscv_v_intrinsic
1734+
ggml_gemv_f16_1xM_f16<16>(n, s, bs, vx, vy, nr, nc);
1735+
return;
1736+
#endif
1737+
ggml_gemv_f16_1x16_f16_generic(n, s, bs, vx, vy, nr, nc);
1738+
}
1739+
1740+
void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1741+
#if defined __riscv_v_intrinsic
1742+
ggml_gemv_f16_1xM_f16<32>(n, s, bs, vx, vy, nr, nc);
1743+
return;
1744+
#endif
1745+
ggml_gemv_f16_1x32_f16_generic(n, s, bs, vx, vy, nr, nc);
1746+
}
1747+
1748+
void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1749+
#if defined __riscv_v_intrinsic
1750+
ggml_gemv_f16_1xM_f16<64>(n, s, bs, vx, vy, nr, nc);
1751+
return;
1752+
#endif
1753+
ggml_gemv_f16_1x64_f16_generic(n, s, bs, vx, vy, nr, nc);
1754+
}
1755+
1756+
void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1757+
#if defined __riscv_v_intrinsic
1758+
ggml_gemv_f16_1xM_f16<128>(n, s, bs, vx, vy, nr, nc);
1759+
return;
1760+
#endif
1761+
ggml_gemv_f16_1x128_f16_generic(n, s, bs, vx, vy, nr, nc);
1762+
}
1763+
1764+
template<int ncols_interleaved>
1765+
static inline void ggml_gemv_f32_1xM_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1766+
const int nb = n / 1;
1767+
1768+
assert (nr == 1);
1769+
assert(n % 1 == 0);
1770+
assert(nc % ncols_interleaved == 0);
1771+
1772+
const float * a_ptr = (const float *) vy;
1773+
for (int x = 0; x < nc / ncols_interleaved; x++) {
1774+
const block_f32<ncols_interleaved, 1> * b_ptr = (const block_f32<ncols_interleaved, 1> *) vx + (x * nb);
1775+
1776+
// Accumulators
1777+
vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1778+
1779+
for (int l = 0; l < nb; l++) {
1780+
vfloat32m4_t b_0 = __riscv_vle32_v_f32m4((const float *)&b_ptr[l].d[0], ncols_interleaved);
1781+
1782+
sumf_0 = __riscv_vfmacc_vf_f32m4(sumf_0, *(const float*)(&a_ptr[l]), b_0, ncols_interleaved);
1783+
}
1784+
1785+
__riscv_vse32_v_f32m4(&s[x * ncols_interleaved], sumf_0, ncols_interleaved);
1786+
}
1787+
1788+
return;
1789+
}
1790+
1791+
void ggml_gemv_f32_1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1792+
#if defined __riscv_v_intrinsic
1793+
ggml_gemv_f32_1xM_f32<16>(n, s, bs, vx, vy, nr, nc);
1794+
return;
1795+
#endif
1796+
ggml_gemv_f32_1x16_f32_generic(n, s, bs, vx, vy, nr, nc);
1797+
}
1798+
1799+
void ggml_gemv_f32_1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1800+
#if defined __riscv_v_intrinsic
1801+
ggml_gemv_f32_1xM_f32<32>(n, s, bs, vx, vy, nr, nc);
1802+
return;
1803+
#endif
1804+
ggml_gemv_f32_1x32_f32_generic(n, s, bs, vx, vy, nr, nc);
1805+
}
1806+
1807+
void ggml_gemv_f32_1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1808+
#if defined __riscv_v_intrinsic
1809+
ggml_gemv_f32_1xM_f32<64>(n, s, bs, vx, vy, nr, nc);
1810+
return;
1811+
#endif
1812+
ggml_gemv_f32_1x64_f32_generic(n, s, bs, vx, vy, nr, nc);
1813+
}
1814+
1815+
void ggml_gemv_f32_1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1816+
#if defined __riscv_v_intrinsic
1817+
ggml_gemv_f32_1xM_f32<128>(n, s, bs, vx, vy, nr, nc);
1818+
return;
1819+
#endif
1820+
ggml_gemv_f32_1x128_f32_generic(n, s, bs, vx, vy, nr, nc);
1821+
}
1822+
1823+
template<int ncols_interleaved>
1824+
static inline void ggml_gemm_f16_7x1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1825+
const int nb = n / 1;
1826+
1827+
assert (nr % 7 == 0);
1828+
assert(n % 1 == 0);
1829+
assert(nc % ncols_interleaved == 0);
1830+
1831+
for (int y = 0; y < nr / 7; y++) {
1832+
const block_f16_7x1 * a_ptr = (const block_f16_7x1*) vy + (y * nb);
1833+
for (int x = 0; x < nc / ncols_interleaved; x++) {
1834+
const block_f16<ncols_interleaved, 1> * b_ptr = (const block_f16<ncols_interleaved, 1> *) vx + (x * nb);
1835+
1836+
// Accumulators
1837+
vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1838+
vfloat32m4_t sumf_1 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1839+
vfloat32m4_t sumf_2 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1840+
vfloat32m4_t sumf_3 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1841+
vfloat32m4_t sumf_4 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1842+
vfloat32m4_t sumf_5 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1843+
vfloat32m4_t sumf_6 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1844+
1845+
for (int l = 0; l < nb; l++) {
1846+
vfloat16m2_t b_0 = __riscv_vle16_v_f16m2((const _Float16 *)&b_ptr[l].d[0], ncols_interleaved);
1847+
1848+
sumf_0 = __riscv_vfwmacc_vf_f32m4(sumf_0, *(const _Float16*)&a_ptr[l].d[0], b_0, ncols_interleaved);
1849+
sumf_1 = __riscv_vfwmacc_vf_f32m4(sumf_1, *(const _Float16*)&a_ptr[l].d[1], b_0, ncols_interleaved);
1850+
sumf_2 = __riscv_vfwmacc_vf_f32m4(sumf_2, *(const _Float16*)&a_ptr[l].d[2], b_0, ncols_interleaved);
1851+
sumf_3 = __riscv_vfwmacc_vf_f32m4(sumf_3, *(const _Float16*)&a_ptr[l].d[3], b_0, ncols_interleaved);
1852+
sumf_4 = __riscv_vfwmacc_vf_f32m4(sumf_4, *(const _Float16*)&a_ptr[l].d[4], b_0, ncols_interleaved);
1853+
sumf_5 = __riscv_vfwmacc_vf_f32m4(sumf_5, *(const _Float16*)&a_ptr[l].d[5], b_0, ncols_interleaved);
1854+
sumf_6 = __riscv_vfwmacc_vf_f32m4(sumf_6, *(const _Float16*)&a_ptr[l].d[6], b_0, ncols_interleaved);
1855+
}
1856+
1857+
__riscv_vse32_v_f32m4(&s[(y * 7 + 0) * bs + x * ncols_interleaved], sumf_0, ncols_interleaved);
1858+
__riscv_vse32_v_f32m4(&s[(y * 7 + 1) * bs + x * ncols_interleaved], sumf_1, ncols_interleaved);
1859+
__riscv_vse32_v_f32m4(&s[(y * 7 + 2) * bs + x * ncols_interleaved], sumf_2, ncols_interleaved);
1860+
__riscv_vse32_v_f32m4(&s[(y * 7 + 3) * bs + x * ncols_interleaved], sumf_3, ncols_interleaved);
1861+
__riscv_vse32_v_f32m4(&s[(y * 7 + 4) * bs + x * ncols_interleaved], sumf_4, ncols_interleaved);
1862+
__riscv_vse32_v_f32m4(&s[(y * 7 + 5) * bs + x * ncols_interleaved], sumf_5, ncols_interleaved);
1863+
__riscv_vse32_v_f32m4(&s[(y * 7 + 6) * bs + x * ncols_interleaved], sumf_6, ncols_interleaved);
1864+
}
1865+
}
1866+
return;
1867+
}
1868+
1869+
void ggml_gemm_f16_7x1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1870+
#if defined __riscv_v_intrinsic
1871+
ggml_gemm_f16_7x1xM_f16<16>(n, s, bs, vx, vy, nr, nc);
1872+
return;
1873+
#endif
1874+
ggml_gemm_f16_7x1x16_f16_generic(n, s, bs, vx, vy, nr, nc);
1875+
}
1876+
1877+
void ggml_gemm_f16_7x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1878+
#if defined __riscv_v_intrinsic
1879+
ggml_gemm_f16_7x1xM_f16<32>(n, s, bs, vx, vy, nr, nc);
1880+
return;
1881+
#endif
1882+
ggml_gemm_f16_7x1x32_f16_generic(n, s, bs, vx, vy, nr, nc);
1883+
}
1884+
1885+
void ggml_gemm_f16_7x1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1886+
#if defined __riscv_v_intrinsic
1887+
ggml_gemm_f16_7x1xM_f16<64>(n, s, bs, vx, vy, nr, nc);
1888+
return;
1889+
#endif
1890+
ggml_gemm_f16_7x1x64_f16_generic(n, s, bs, vx, vy, nr, nc);
1891+
}
1892+
1893+
void ggml_gemm_f16_7x1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1894+
#if defined __riscv_v_intrinsic
1895+
ggml_gemm_f16_7x1xM_f16<128>(n, s, bs, vx, vy, nr, nc);
1896+
return;
1897+
#endif
1898+
ggml_gemm_f16_7x1x128_f16_generic(n, s, bs, vx, vy, nr, nc);
1899+
}
1900+
1901+
template<int ncols_interleaved>
1902+
static inline void ggml_gemm_f32_7x1xM_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1903+
const int nb = n / 1;
1904+
1905+
assert (nr % 7 == 0);
1906+
assert(n % 1 == 0);
1907+
assert(nc % ncols_interleaved == 0);
1908+
1909+
for (int y = 0; y < nr / 7; y++) {
1910+
const block_f32_7x1 * a_ptr = (const block_f32_7x1*) vy + (y * nb);
1911+
for (int x = 0; x < nc / ncols_interleaved; x++) {
1912+
const block_f32<ncols_interleaved, 1> * b_ptr = (const block_f32<ncols_interleaved, 1> *) vx + (x * nb);
1913+
1914+
// Accumulators
1915+
vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1916+
vfloat32m4_t sumf_1 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1917+
vfloat32m4_t sumf_2 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1918+
vfloat32m4_t sumf_3 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1919+
vfloat32m4_t sumf_4 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1920+
vfloat32m4_t sumf_5 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1921+
vfloat32m4_t sumf_6 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved);
1922+
1923+
for (int l = 0; l < nb; l++) {
1924+
vfloat32m4_t b_0 = __riscv_vle32_v_f32m4((const float*)&b_ptr[l].d[0], ncols_interleaved);
1925+
1926+
sumf_0 = __riscv_vfmacc_vf_f32m4(sumf_0, *(const float*)&a_ptr[l].d[0], b_0, ncols_interleaved);
1927+
sumf_1 = __riscv_vfmacc_vf_f32m4(sumf_1, *(const float*)&a_ptr[l].d[1], b_0, ncols_interleaved);
1928+
sumf_2 = __riscv_vfmacc_vf_f32m4(sumf_2, *(const float*)&a_ptr[l].d[2], b_0, ncols_interleaved);
1929+
sumf_3 = __riscv_vfmacc_vf_f32m4(sumf_3, *(const float*)&a_ptr[l].d[3], b_0, ncols_interleaved);
1930+
sumf_4 = __riscv_vfmacc_vf_f32m4(sumf_4, *(const float*)&a_ptr[l].d[4], b_0, ncols_interleaved);
1931+
sumf_5 = __riscv_vfmacc_vf_f32m4(sumf_5, *(const float*)&a_ptr[l].d[5], b_0, ncols_interleaved);
1932+
sumf_6 = __riscv_vfmacc_vf_f32m4(sumf_6, *(const float*)&a_ptr[l].d[6], b_0, ncols_interleaved);
1933+
}
1934+
1935+
__riscv_vse32_v_f32m4(&s[(y * 7 + 0) * bs + x * ncols_interleaved], sumf_0, ncols_interleaved);
1936+
__riscv_vse32_v_f32m4(&s[(y * 7 + 1) * bs + x * ncols_interleaved], sumf_1, ncols_interleaved);
1937+
__riscv_vse32_v_f32m4(&s[(y * 7 + 2) * bs + x * ncols_interleaved], sumf_2, ncols_interleaved);
1938+
__riscv_vse32_v_f32m4(&s[(y * 7 + 3) * bs + x * ncols_interleaved], sumf_3, ncols_interleaved);
1939+
__riscv_vse32_v_f32m4(&s[(y * 7 + 4) * bs + x * ncols_interleaved], sumf_4, ncols_interleaved);
1940+
__riscv_vse32_v_f32m4(&s[(y * 7 + 5) * bs + x * ncols_interleaved], sumf_5, ncols_interleaved);
1941+
__riscv_vse32_v_f32m4(&s[(y * 7 + 6) * bs + x * ncols_interleaved], sumf_6, ncols_interleaved);
1942+
}
1943+
}
1944+
return;
1945+
}
1946+
1947+
void ggml_gemm_f32_7x1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1948+
#if defined __riscv_v_intrinsic
1949+
ggml_gemm_f32_7x1xM_f32<16>(n, s, bs, vx, vy, nr, nc);
1950+
return;
1951+
#endif
1952+
ggml_gemm_f32_7x1x16_f32_generic(n, s, bs, vx, vy, nr, nc);
1953+
}
1954+
1955+
void ggml_gemm_f32_7x1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1956+
#if defined __riscv_v_intrinsic
1957+
ggml_gemm_f32_7x1xM_f32<32>(n, s, bs, vx, vy, nr, nc);
1958+
return;
1959+
#endif
1960+
ggml_gemm_f32_7x1x32_f32_generic(n, s, bs, vx, vy, nr, nc);
1961+
}
1962+
1963+
void ggml_gemm_f32_7x1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1964+
#if defined __riscv_v_intrinsic
1965+
ggml_gemm_f32_7x1xM_f32<64>(n, s, bs, vx, vy, nr, nc);
1966+
return;
1967+
#endif
1968+
ggml_gemm_f32_7x1x64_f32_generic(n, s, bs, vx, vy, nr, nc);
1969+
}
1970+
1971+
void ggml_gemm_f32_7x1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1972+
#if defined __riscv_v_intrinsic
1973+
ggml_gemm_f32_7x1xM_f32<128>(n, s, bs, vx, vy, nr, nc);
1974+
return;
1975+
#endif
1976+
ggml_gemm_f32_7x1x128_f32_generic(n, s, bs, vx, vy, nr, nc);
1977+
}

0 commit comments

Comments
 (0)