@@ -1701,3 +1701,277 @@ void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
17011701 }
17021702}
17031703#endif
1704+
1705+ template <int ncols_interleaved>
1706+ static inline void ggml_gemv_f16_1xM_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1707+ const int nb = n / 1 ;
1708+
1709+ assert (nr == 1 );
1710+ assert (n % 1 == 0 );
1711+ assert (nc % ncols_interleaved == 0 );
1712+
1713+ const _Float16 * a_ptr = (const _Float16 *) vy;
1714+ for (int x = 0 ; x < nc / ncols_interleaved; x++) {
1715+ const block_f16<ncols_interleaved, 1 > * b_ptr = (const block_f16<ncols_interleaved, 1 > *) vx + (x * nb);
1716+
1717+ // Accumulators
1718+ vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1719+
1720+ for (int l = 0 ; l < nb; l++) {
1721+ vfloat16m2_t b_0 = __riscv_vle16_v_f16m2 ((const _Float16 *)&b_ptr[l].d [0 ], ncols_interleaved);
1722+
1723+ sumf_0 = __riscv_vfwmacc_vf_f32m4 (sumf_0, *(const _Float16*)(&a_ptr[l]), b_0, ncols_interleaved);
1724+ }
1725+
1726+ __riscv_vse32_v_f32m4 (&s[x * ncols_interleaved], sumf_0, ncols_interleaved);
1727+ }
1728+
1729+ return ;
1730+ }
1731+
1732+ void ggml_gemv_f16_1x16_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1733+ #if defined __riscv_v_intrinsic
1734+ ggml_gemv_f16_1xM_f16<16 >(n, s, bs, vx, vy, nr, nc);
1735+ return ;
1736+ #endif
1737+ ggml_gemv_f16_1x16_f16_generic (n, s, bs, vx, vy, nr, nc);
1738+ }
1739+
1740+ void ggml_gemv_f16_1x32_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1741+ #if defined __riscv_v_intrinsic
1742+ ggml_gemv_f16_1xM_f16<32 >(n, s, bs, vx, vy, nr, nc);
1743+ return ;
1744+ #endif
1745+ ggml_gemv_f16_1x32_f16_generic (n, s, bs, vx, vy, nr, nc);
1746+ }
1747+
1748+ void ggml_gemv_f16_1x64_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1749+ #if defined __riscv_v_intrinsic
1750+ ggml_gemv_f16_1xM_f16<64 >(n, s, bs, vx, vy, nr, nc);
1751+ return ;
1752+ #endif
1753+ ggml_gemv_f16_1x64_f16_generic (n, s, bs, vx, vy, nr, nc);
1754+ }
1755+
1756+ void ggml_gemv_f16_1x128_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1757+ #if defined __riscv_v_intrinsic
1758+ ggml_gemv_f16_1xM_f16<128 >(n, s, bs, vx, vy, nr, nc);
1759+ return ;
1760+ #endif
1761+ ggml_gemv_f16_1x128_f16_generic (n, s, bs, vx, vy, nr, nc);
1762+ }
1763+
1764+ template <int ncols_interleaved>
1765+ static inline void ggml_gemv_f32_1xM_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1766+ const int nb = n / 1 ;
1767+
1768+ assert (nr == 1 );
1769+ assert (n % 1 == 0 );
1770+ assert (nc % ncols_interleaved == 0 );
1771+
1772+ const float * a_ptr = (const float *) vy;
1773+ for (int x = 0 ; x < nc / ncols_interleaved; x++) {
1774+ const block_f32<ncols_interleaved, 1 > * b_ptr = (const block_f32<ncols_interleaved, 1 > *) vx + (x * nb);
1775+
1776+ // Accumulators
1777+ vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1778+
1779+ for (int l = 0 ; l < nb; l++) {
1780+ vfloat32m4_t b_0 = __riscv_vle32_v_f32m4 ((const float *)&b_ptr[l].d [0 ], ncols_interleaved);
1781+
1782+ sumf_0 = __riscv_vfmacc_vf_f32m4 (sumf_0, *(const float *)(&a_ptr[l]), b_0, ncols_interleaved);
1783+ }
1784+
1785+ __riscv_vse32_v_f32m4 (&s[x * ncols_interleaved], sumf_0, ncols_interleaved);
1786+ }
1787+
1788+ return ;
1789+ }
1790+
1791+ void ggml_gemv_f32_1x16_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1792+ #if defined __riscv_v_intrinsic
1793+ ggml_gemv_f32_1xM_f32<16 >(n, s, bs, vx, vy, nr, nc);
1794+ return ;
1795+ #endif
1796+ ggml_gemv_f32_1x16_f32_generic (n, s, bs, vx, vy, nr, nc);
1797+ }
1798+
1799+ void ggml_gemv_f32_1x32_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1800+ #if defined __riscv_v_intrinsic
1801+ ggml_gemv_f32_1xM_f32<32 >(n, s, bs, vx, vy, nr, nc);
1802+ return ;
1803+ #endif
1804+ ggml_gemv_f32_1x32_f32_generic (n, s, bs, vx, vy, nr, nc);
1805+ }
1806+
1807+ void ggml_gemv_f32_1x64_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1808+ #if defined __riscv_v_intrinsic
1809+ ggml_gemv_f32_1xM_f32<64 >(n, s, bs, vx, vy, nr, nc);
1810+ return ;
1811+ #endif
1812+ ggml_gemv_f32_1x64_f32_generic (n, s, bs, vx, vy, nr, nc);
1813+ }
1814+
1815+ void ggml_gemv_f32_1x128_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1816+ #if defined __riscv_v_intrinsic
1817+ ggml_gemv_f32_1xM_f32<128 >(n, s, bs, vx, vy, nr, nc);
1818+ return ;
1819+ #endif
1820+ ggml_gemv_f32_1x128_f32_generic (n, s, bs, vx, vy, nr, nc);
1821+ }
1822+
1823+ template <int ncols_interleaved>
1824+ static inline void ggml_gemm_f16_7x1xM_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1825+ const int nb = n / 1 ;
1826+
1827+ assert (nr % 7 == 0 );
1828+ assert (n % 1 == 0 );
1829+ assert (nc % ncols_interleaved == 0 );
1830+
1831+ for (int y = 0 ; y < nr / 7 ; y++) {
1832+ const block_f16_7x1 * a_ptr = (const block_f16_7x1*) vy + (y * nb);
1833+ for (int x = 0 ; x < nc / ncols_interleaved; x++) {
1834+ const block_f16<ncols_interleaved, 1 > * b_ptr = (const block_f16<ncols_interleaved, 1 > *) vx + (x * nb);
1835+
1836+ // Accumulators
1837+ vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1838+ vfloat32m4_t sumf_1 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1839+ vfloat32m4_t sumf_2 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1840+ vfloat32m4_t sumf_3 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1841+ vfloat32m4_t sumf_4 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1842+ vfloat32m4_t sumf_5 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1843+ vfloat32m4_t sumf_6 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1844+
1845+ for (int l = 0 ; l < nb; l++) {
1846+ vfloat16m2_t b_0 = __riscv_vle16_v_f16m2 ((const _Float16 *)&b_ptr[l].d [0 ], ncols_interleaved);
1847+
1848+ sumf_0 = __riscv_vfwmacc_vf_f32m4 (sumf_0, *(const _Float16*)&a_ptr[l].d [0 ], b_0, ncols_interleaved);
1849+ sumf_1 = __riscv_vfwmacc_vf_f32m4 (sumf_1, *(const _Float16*)&a_ptr[l].d [1 ], b_0, ncols_interleaved);
1850+ sumf_2 = __riscv_vfwmacc_vf_f32m4 (sumf_2, *(const _Float16*)&a_ptr[l].d [2 ], b_0, ncols_interleaved);
1851+ sumf_3 = __riscv_vfwmacc_vf_f32m4 (sumf_3, *(const _Float16*)&a_ptr[l].d [3 ], b_0, ncols_interleaved);
1852+ sumf_4 = __riscv_vfwmacc_vf_f32m4 (sumf_4, *(const _Float16*)&a_ptr[l].d [4 ], b_0, ncols_interleaved);
1853+ sumf_5 = __riscv_vfwmacc_vf_f32m4 (sumf_5, *(const _Float16*)&a_ptr[l].d [5 ], b_0, ncols_interleaved);
1854+ sumf_6 = __riscv_vfwmacc_vf_f32m4 (sumf_6, *(const _Float16*)&a_ptr[l].d [6 ], b_0, ncols_interleaved);
1855+ }
1856+
1857+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 0 ) * bs + x * ncols_interleaved], sumf_0, ncols_interleaved);
1858+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 1 ) * bs + x * ncols_interleaved], sumf_1, ncols_interleaved);
1859+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 2 ) * bs + x * ncols_interleaved], sumf_2, ncols_interleaved);
1860+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 3 ) * bs + x * ncols_interleaved], sumf_3, ncols_interleaved);
1861+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 4 ) * bs + x * ncols_interleaved], sumf_4, ncols_interleaved);
1862+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 5 ) * bs + x * ncols_interleaved], sumf_5, ncols_interleaved);
1863+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 6 ) * bs + x * ncols_interleaved], sumf_6, ncols_interleaved);
1864+ }
1865+ }
1866+ return ;
1867+ }
1868+
1869+ void ggml_gemm_f16_7x1x16_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1870+ #if defined __riscv_v_intrinsic
1871+ ggml_gemm_f16_7x1xM_f16<16 >(n, s, bs, vx, vy, nr, nc);
1872+ return ;
1873+ #endif
1874+ ggml_gemm_f16_7x1x16_f16_generic (n, s, bs, vx, vy, nr, nc);
1875+ }
1876+
1877+ void ggml_gemm_f16_7x1x32_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1878+ #if defined __riscv_v_intrinsic
1879+ ggml_gemm_f16_7x1xM_f16<32 >(n, s, bs, vx, vy, nr, nc);
1880+ return ;
1881+ #endif
1882+ ggml_gemm_f16_7x1x32_f16_generic (n, s, bs, vx, vy, nr, nc);
1883+ }
1884+
1885+ void ggml_gemm_f16_7x1x64_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1886+ #if defined __riscv_v_intrinsic
1887+ ggml_gemm_f16_7x1xM_f16<64 >(n, s, bs, vx, vy, nr, nc);
1888+ return ;
1889+ #endif
1890+ ggml_gemm_f16_7x1x64_f16_generic (n, s, bs, vx, vy, nr, nc);
1891+ }
1892+
1893+ void ggml_gemm_f16_7x1x128_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1894+ #if defined __riscv_v_intrinsic
1895+ ggml_gemm_f16_7x1xM_f16<128 >(n, s, bs, vx, vy, nr, nc);
1896+ return ;
1897+ #endif
1898+ ggml_gemm_f16_7x1x128_f16_generic (n, s, bs, vx, vy, nr, nc);
1899+ }
1900+
1901+ template <int ncols_interleaved>
1902+ static inline void ggml_gemm_f32_7x1xM_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1903+ const int nb = n / 1 ;
1904+
1905+ assert (nr % 7 == 0 );
1906+ assert (n % 1 == 0 );
1907+ assert (nc % ncols_interleaved == 0 );
1908+
1909+ for (int y = 0 ; y < nr / 7 ; y++) {
1910+ const block_f32_7x1 * a_ptr = (const block_f32_7x1*) vy + (y * nb);
1911+ for (int x = 0 ; x < nc / ncols_interleaved; x++) {
1912+ const block_f32<ncols_interleaved, 1 > * b_ptr = (const block_f32<ncols_interleaved, 1 > *) vx + (x * nb);
1913+
1914+ // Accumulators
1915+ vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1916+ vfloat32m4_t sumf_1 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1917+ vfloat32m4_t sumf_2 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1918+ vfloat32m4_t sumf_3 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1919+ vfloat32m4_t sumf_4 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1920+ vfloat32m4_t sumf_5 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1921+ vfloat32m4_t sumf_6 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1922+
1923+ for (int l = 0 ; l < nb; l++) {
1924+ vfloat32m4_t b_0 = __riscv_vle32_v_f32m4 ((const float *)&b_ptr[l].d [0 ], ncols_interleaved);
1925+
1926+ sumf_0 = __riscv_vfmacc_vf_f32m4 (sumf_0, *(const float *)&a_ptr[l].d [0 ], b_0, ncols_interleaved);
1927+ sumf_1 = __riscv_vfmacc_vf_f32m4 (sumf_1, *(const float *)&a_ptr[l].d [1 ], b_0, ncols_interleaved);
1928+ sumf_2 = __riscv_vfmacc_vf_f32m4 (sumf_2, *(const float *)&a_ptr[l].d [2 ], b_0, ncols_interleaved);
1929+ sumf_3 = __riscv_vfmacc_vf_f32m4 (sumf_3, *(const float *)&a_ptr[l].d [3 ], b_0, ncols_interleaved);
1930+ sumf_4 = __riscv_vfmacc_vf_f32m4 (sumf_4, *(const float *)&a_ptr[l].d [4 ], b_0, ncols_interleaved);
1931+ sumf_5 = __riscv_vfmacc_vf_f32m4 (sumf_5, *(const float *)&a_ptr[l].d [5 ], b_0, ncols_interleaved);
1932+ sumf_6 = __riscv_vfmacc_vf_f32m4 (sumf_6, *(const float *)&a_ptr[l].d [6 ], b_0, ncols_interleaved);
1933+ }
1934+
1935+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 0 ) * bs + x * ncols_interleaved], sumf_0, ncols_interleaved);
1936+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 1 ) * bs + x * ncols_interleaved], sumf_1, ncols_interleaved);
1937+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 2 ) * bs + x * ncols_interleaved], sumf_2, ncols_interleaved);
1938+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 3 ) * bs + x * ncols_interleaved], sumf_3, ncols_interleaved);
1939+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 4 ) * bs + x * ncols_interleaved], sumf_4, ncols_interleaved);
1940+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 5 ) * bs + x * ncols_interleaved], sumf_5, ncols_interleaved);
1941+ __riscv_vse32_v_f32m4 (&s[(y * 7 + 6 ) * bs + x * ncols_interleaved], sumf_6, ncols_interleaved);
1942+ }
1943+ }
1944+ return ;
1945+ }
1946+
1947+ void ggml_gemm_f32_7x1x16_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1948+ #if defined __riscv_v_intrinsic
1949+ ggml_gemm_f32_7x1xM_f32<16 >(n, s, bs, vx, vy, nr, nc);
1950+ return ;
1951+ #endif
1952+ ggml_gemm_f32_7x1x16_f32_generic (n, s, bs, vx, vy, nr, nc);
1953+ }
1954+
1955+ void ggml_gemm_f32_7x1x32_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1956+ #if defined __riscv_v_intrinsic
1957+ ggml_gemm_f32_7x1xM_f32<32 >(n, s, bs, vx, vy, nr, nc);
1958+ return ;
1959+ #endif
1960+ ggml_gemm_f32_7x1x32_f32_generic (n, s, bs, vx, vy, nr, nc);
1961+ }
1962+
1963+ void ggml_gemm_f32_7x1x64_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1964+ #if defined __riscv_v_intrinsic
1965+ ggml_gemm_f32_7x1xM_f32<64 >(n, s, bs, vx, vy, nr, nc);
1966+ return ;
1967+ #endif
1968+ ggml_gemm_f32_7x1x64_f32_generic (n, s, bs, vx, vy, nr, nc);
1969+ }
1970+
1971+ void ggml_gemm_f32_7x1x128_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1972+ #if defined __riscv_v_intrinsic
1973+ ggml_gemm_f32_7x1xM_f32<128 >(n, s, bs, vx, vy, nr, nc);
1974+ return ;
1975+ #endif
1976+ ggml_gemm_f32_7x1x128_f32_generic (n, s, bs, vx, vy, nr, nc);
1977+ }
0 commit comments