From f6ac08424f490b70b94057deb28ab70e8ebc549c Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 13:27:40 -0800 Subject: [PATCH 1/5] Feat: Added vulkan circular tiling support --- ggml/include/ggml.h | 82 +++++++++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 12 ++ .../ggml-vulkan/vulkan-shaders/conv2d_dw.comp | 76 ++++++++--- .../ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 17 ++- ggml/src/ggml-vulkan/vulkan-shaders/pad.comp | 26 +++- ggml/src/ggml.c | 127 +++++++++++++++++- 6 files changed, 310 insertions(+), 30 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2311cdabe3ba4..bb96e1bb5dbb8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1943,6 +1943,18 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + + GGML_API struct ggml_tensor * ggml_conv_2d_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + GGML_API struct ggml_tensor * ggml_im2col_3d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2016,6 +2028,19 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + + // depthwise (via im2col and mul_mat) + GGML_API struct ggml_tensor * ggml_conv_2d_dw_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + // Depthwise 2D convolution // may be faster than ggml_conv_2d_dw, but not available in all backends // a: KW KH 1 C convolution kernel @@ -2032,12 +2057,35 @@ extern "C" { int dilation0, int dilation1); + // Depthwise 2D convolution (on a torus) + // may be faster than ggml_conv_2d_dw, but not available in all backends + // a: KW KH 1 C convolution kernel + // b: W H C N input data + // res: W_out H_out C N + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int stride); + // circular (on a torus) + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride); + GGML_API struct ggml_tensor * ggml_conv_2d_direct( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] @@ -2048,6 +2096,17 @@ extern "C" { int p1, // padding dimension 1 int d0, // dilation dimension 0 int d1); // dilation dimension 1 + + GGML_API struct ggml_tensor * ggml_conv_2d_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 GGML_API struct ggml_tensor * ggml_conv_3d_direct( struct ggml_context * ctx, @@ -2156,6 +2215,15 @@ extern "C" { int 
p2, int p3); + // pad each dimension with values on the other side of the torus (looping around) + GGML_API struct ggml_tensor * ggml_pad_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3); + GGML_API struct ggml_tensor * ggml_pad_ext( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2169,6 +2237,20 @@ extern "C" { int rp3 ); + // circular padding + GGML_API struct ggml_tensor * ggml_pad_ext_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3 + ); + // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] GGML_API struct ggml_tensor * ggml_pad_reflect_1d( struct ggml_context * ctx, diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 8d1a85c96939b..69b58e32950a5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -940,6 +940,7 @@ struct vk_op_pad_push_constants { uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; + uint32_t circular; uint32_t lp0; uint32_t rp0; uint32_t lp1; uint32_t rp1; @@ -982,6 +983,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor p.rp2 = dst->op_params[5]; p.lp3 = dst->op_params[6]; p.rp3 = dst->op_params[7]; + p.circular = dst->op_params[8]; return p; // fastdiv values and offsets are initialized later in ggml_vk_op } @@ -1249,6 +1251,8 @@ struct vk_op_conv2d_push_constants { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; + + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { @@ -1297,6 +1301,8 @@ struct vk_op_conv_transpose_2d_push_constants { uint32_t OWOHmp; uint32_t OWOHL; uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; + + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv_transpose_2d_push_constants &p) { @@ -1325,6 +1331,7 @@ struct vk_op_conv2d_dw_push_constants { int32_t pad_y; int32_t dilation_x; int32_t dilation_y; + uint32_t circular; }; struct vk_op_upscale_push_constants { @@ -10420,6 +10427,8 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = static_cast(dst->op_params[6]); + GGML_ASSERT(ne03 == ne2); GGML_ASSERT(ne02 == ne12); @@ -10469,6 +10478,8 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = static_cast(dst->op_params[1]); + GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne12); @@ -10492,6 +10503,7 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = dst->op_params[6]; GGML_ASSERT(src0->ne[3] == p.channels); GGML_ASSERT(src1->ne[3] == p.batches); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp index 70a301488eb1d..88bd1d7a755f6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp @@ -19,6 +19,7 @@ layout (push_constant) 
uniform parameter int pad_y; int dilation_x; int dilation_y; + uint circular; } p; layout (binding = 0) readonly buffer A {A_TYPE knl_data[];}; @@ -27,6 +28,10 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; +uint32_t wrap_coord(int coord, uint32_t size) { + return uint32_t((uint(coord + int(size))) % size); +} + FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint i0 = idx / p.dst_w; uint dst_x = idx - i0 * p.dst_w; @@ -39,19 +44,35 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint knl_i = c * p.knl_h * p.knl_w; FLOAT_TYPE sum = 0.0; - for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; + + if (p.circular != 0u) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + } + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); - sum = fma(v, k, sum); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } } return sum; @@ -70,19 +91,34 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) { uint knl_row = p.knl_w * p.channels; FLOAT_TYPE sum = 0.0; - for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; + if (p.circular != 0u) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + } + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if 
(src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); - sum = fma(v, k, sum); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } } return sum; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 0367e80bbfa73..c18fd92ccaf9a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -70,6 +70,8 @@ layout(push_constant) uniform parameter { uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; #endif + + uint32_t circular; } p; @@ -174,6 +176,10 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T } #endif +uint32_t wrap_coord(int coord, uint32_t size) { + return uint32_t((uint(coord + int(size))) % size); +} + void main() { #ifdef COOPMAT2 coopmat matC; @@ -274,7 +280,8 @@ void main() { KH_idx_b = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW; KW_idx_b = CRS_remainder - KH_idx_b * p.KW; #endif - + uint32_t H_pos; + uint32_t W_pos; #ifdef TRANSPOSE uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1; uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0; @@ -284,13 +291,15 @@ void main() { uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; #endif + H_pos = (p.circular != 0) ? wrap_coord(int(H_idx), p.H) : H_idx; + W_pos = (p.circular != 0) ? wrap_coord(int(W_idx), p.W) : W_idx; uint32_t src_idx = - min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + min(max(W_pos + H_pos * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); float val = src_data[src_idx]; if (CRS_idx_b >= CRS || NPQ_idx >= NPQ - || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case) + || H_pos >= p.H || W_pos >= p.W // Lower bound checks aren't necessary. 
(idx >= 0x80000000 for such case) #ifdef TRANSPOSE - || (H_idx_x_s1 - H_idx * p.s1 != 0) || (W_idx_x_s0 - W_idx * p.s0 != 0) + || (H_idx_x_s1 - H_pos * p.s1 != 0) || (W_idx_x_s0 - W_pos * p.s0 != 0) #endif ) { val = 0.0; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp index f3c8176872758..f2fd5929bf41d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp @@ -8,6 +8,7 @@ layout (push_constant) uniform parameter uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint misalign_offsets; + uint circular; uint lp0; uint rp0; uint lp1; uint rp1; @@ -18,6 +19,10 @@ layout (push_constant) uniform parameter uint get_aoffset() { return p.misalign_offsets >> 16; } uint get_doffset() { return p.misalign_offsets & 0xFFFF; } +uint wrap_coord(int coord, uint size) { + return (uint(coord + int(size))) % size; // add size to avoid issues with negative +} + layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; @@ -40,10 +45,21 @@ void main() { const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00; const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; - const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 && - i1 >= p.lp1 && i1 < p.ne11 - p.rp1 && - i2 >= p.lp2 && i2 < p.ne12 - p.rp2 && - i3 >= p.lp3 && i3 < p.ne13 - p.rp3; + if (p.circular != 0u) { + const uint ci0 = wrap_coord(int(i0) - int(p.lp0), p.ne00); + const uint ci1 = wrap_coord(int(i1) - int(p.lp1), p.ne01); + const uint ci2 = wrap_coord(int(i2) - int(p.lp2), p.ne02); + const uint ci3 = wrap_coord(int(i3) - int(p.lp3), p.ne03); + const uint circular_src_idx = ci3*p.nb03 + ci2*p.nb02 + ci1*p.nb01 + ci0*p.nb00; + data_d[get_doffset() + dst_idx] = D_TYPE(data_a[get_aoffset() + circular_src_idx]); + } + else { + const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 && + i1 >= p.lp1 && i1 < p.ne11 - p.rp1 && + i2 >= p.lp2 && i2 < p.ne12 - p.rp2 && + i3 >= p.lp3 && i3 < p.ne13 - p.rp3; + data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f); + } + - data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? 
data_a[get_aoffset() + src0_idx] : 0.0f); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9be35c1be8456..1113164f48865 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4457,6 +4457,29 @@ struct ggml_tensor * ggml_conv_2d( return result; } + +// ggml_conv_2d_circular + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct ggml_tensor * ggml_conv_2d_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + if (p0 == 0 && p1 == 0) { + return ggml_conv_2d(ctx, a, b, s0, s1, p0, p1, d0, d1); + } + struct ggml_tensor * b_padded = ggml_pad_ext_circular(ctx, b, p0, p0, p1, p1, 0, 0, 0, 0); + return ggml_conv_2d(ctx, a, b_padded, s0, s1, 0, 0, d0, d1); +} + // a: [OC*IC, KD, KH, KW] // b: [N*IC, ID, IH, IW] // result: [N*OD, OH, OW, IC * KD * KH * KW] @@ -4585,6 +4608,25 @@ struct ggml_tensor * ggml_conv_2d_dw( return result; } +// ggml_conv_2d_dw_circular + +struct ggml_tensor * ggml_conv_2d_dw_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + if (p0 == 0 && p1 == 0) { + return ggml_conv_2d_dw(ctx, a, b, s0, s1, p0, p1, d0, d1); + } + struct ggml_tensor * b_padded = ggml_pad_ext_circular(ctx, b, p0, p0, p1, p1, 0, 0, 0, 0); + return ggml_conv_2d_dw(ctx, a, b_padded, s0, s1, 0, 0, d0, d1); +} + // ggml_conv_2d_dw_direct struct ggml_tensor * ggml_conv_2d_dw_direct( @@ -4616,7 +4658,9 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( result->nb[2] = type_size; } - int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 }; + int circular = 0; // default not circular + + int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1, circular }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_2D_DW; @@ -4625,6 +4669,24 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( return result; } +// ggml_conv_2d_dw_direct_circular + +struct ggml_tensor * ggml_conv_2d_dw_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1) { + struct ggml_tensor * result = + ggml_conv_2d_dw_direct(ctx, a, b, stride0, stride1, pad0, pad1, dilation0, dilation1); + ggml_set_op_params_i32(result, 6, 1); // circular + return result; +} + // ggml_conv_2d_direct struct ggml_tensor * ggml_conv_2d_direct( @@ -4655,6 +4717,7 @@ struct ggml_tensor * ggml_conv_2d_direct( ggml_set_op_params_i32(result, 3, p1); ggml_set_op_params_i32(result, 4, d0); ggml_set_op_params_i32(result, 5, d1); + ggml_set_op_params_i32(result, 6, 0); // default not circularc result->op = GGML_OP_CONV_2D; result->src[0] = a; @@ -4663,6 +4726,23 @@ struct ggml_tensor * ggml_conv_2d_direct( return result; } +// ggml_conv_2d_direct_circular + +struct ggml_tensor * ggml_conv_2d_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_tensor * result = ggml_conv_2d_direct(ctx, a, b, s0, s1, p0, p1, d0, d1); + ggml_set_op_params_i32(result, 6, 1); // circular + return result; +} + // ggml_conv_3d_direct struct ggml_tensor * ggml_conv_3d_direct( @@ -4735,6 +4815,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); ggml_set_op_params_i32(result, 0, stride); + 
ggml_set_op_params_i32(result, 1, 0); // circular default off result->op = GGML_OP_CONV_TRANSPOSE_2D; result->src[0] = a; @@ -4743,6 +4824,18 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( return result; } +// ggml_conv_transpose_2d_p0_circular + +struct ggml_tensor * ggml_conv_transpose_2d_p0_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride) { + struct ggml_tensor * result = ggml_conv_transpose_2d_p0(ctx, a, b, stride); + ggml_set_op_params_i32(result, 1, 1); // circular enabled + return result; +} + // ggml_pool_* static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) { @@ -4894,6 +4987,18 @@ struct ggml_tensor * ggml_pad( return ggml_pad_ext(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); } +// ggml_pad_circular + +struct ggml_tensor * ggml_pad_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3) { + return ggml_pad_ext_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); +} + struct ggml_tensor * ggml_pad_ext( struct ggml_context * ctx, struct ggml_tensor * a, @@ -4920,6 +5025,7 @@ struct ggml_tensor * ggml_pad_ext( ggml_set_op_params_i32(result, 5, rp2); ggml_set_op_params_i32(result, 6, lp3); ggml_set_op_params_i32(result, 7, rp3); + ggml_set_op_params_i32(result, 8, 0); // not circular by default result->op = GGML_OP_PAD; @@ -4928,6 +5034,25 @@ struct ggml_tensor * ggml_pad_ext( return result; } +// ggml_pad_ext_circular + +struct ggml_tensor * ggml_pad_ext_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3 + ) { + struct ggml_tensor * result = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + ggml_set_op_params_i32(result, 8, 1); // circular + return result; +} + // ggml_pad_reflect_1d struct ggml_tensor * ggml_pad_reflect_1d( From d7f5958b9e3f3553e211e9cdab83f3c222c504fa Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 13:50:52 -0800 Subject: [PATCH 2/5] Feat: Added cpu circular --- ggml/src/ggml-cpu/ops.cpp | 324 +++++++++++++++++++++++++++----------- 1 file changed, 232 insertions(+), 92 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index f66d36ff62c03..92fd172769abc 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6674,6 +6674,10 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params ggml_compute_forward_mul_mat(params, &dst); } +static inline int64_t ggml_wrap_coord(int64_t coord, int64_t size) { + return (coord + size) % size; // adding size avoids negative number weirdness +} + // ggml_compute_forward_conv_2d static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params, @@ -6694,6 +6698,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int32_t pad_y = dst->op_params[3]; const int32_t dilation_x = dst->op_params[4]; const int32_t dilation_y = dst->op_params[5]; + const bool circular = dst->op_params[6]; const int64_t c_in = src->ne[2]; const int64_t c_out = kernel->ne[3]; @@ -6734,40 +6739,73 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch); //im2col for a patch - for (int64_t p = patch_start; p < patch_end; ++p) { - const int64_t batch_n = p / (dst_w * dst_h); - const int64_t src_x = (p / dst_w) % dst_h; - const int64_t src_y = p % dst_w; + if (circular == 0) { + for 
(int64_t p = patch_start; p < patch_end; ++p) { + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t src_x = (p / dst_w) % dst_h; + const int64_t src_y = p % dst_w; - const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); - char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; + const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; - for (int64_t ic = 0; ic < c_in; ++ic) { - for (int64_t ky = 0; ky < knl_h; ++ky) { - for (int64_t kx = 0; kx < knl_w; ++kx) { - const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y; - const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x; + for (int64_t ic = 0; ic < c_in; ++ic) { + for (int64_t ky = 0; ky < knl_h; ++ky) { + for (int64_t kx = 0; kx < knl_w; ++kx) { + const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y; + const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x; - int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; + int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; - float src_val; - if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { - src_val = 0.0f; - } else { - const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); - src_val = *src_ptr; + float src_val; + if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { + src_val = 0.0f; + } else { + const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + src_val = *src_ptr; + } + + char * element_ptr = dst_row + dst_idx * traits->type_size; + if (kernel_type == GGML_TYPE_F32) { + *(float *) element_ptr = src_val; + } else if (kernel_type == GGML_TYPE_F16) { + *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + } } + } + } + } // patches handled by this thread + } + else { + for (int64_t p = patch_start; p < patch_end; ++p) { + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t src_x = (p / dst_w) % dst_h; + const int64_t src_y = p % dst_w; + + const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; + + for (int64_t ic = 0; ic < c_in; ++ic) { + for (int64_t ky = 0; ky < knl_h; ++ky) { + for (int64_t kx = 0; kx < knl_w; ++kx) { + const int64_t sy = ggml_wrap_coord(src_x * stride_y + ky * dilation_y - pad_y, src_h); + const int64_t sx = ggml_wrap_coord(src_y * stride_x + kx * dilation_x - pad_x, src_w); + + int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; - char * element_ptr = dst_row + dst_idx * traits->type_size; - if (kernel_type == GGML_TYPE_F32) { - *(float *) element_ptr = src_val; - } else if (kernel_type == GGML_TYPE_F16) { - *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + float src_val = *src_ptr; + char * element_ptr = dst_row + dst_idx * traits->type_size; + if (kernel_type == GGML_TYPE_F32) { + *(float *) element_ptr = src_val; + } else if (kernel_type == GGML_TYPE_F16) { + *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + } } } } - } - } // patches handled by this thread + } // patches handled by this thread + } + ggml_barrier(params->threadpool); @@ -7066,6 +7104,7 @@ struct ggml_conv_2d_dw_params { int pad_y; int 
dilation_x; int dilation_y; + int circular; }; static void ggml_compute_forward_conv_2d_dw_cwhn( @@ -7091,57 +7130,103 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( const int64_t c_pkg_end = 0; #endif - for (int64_t row = row_start; row < row_end; ++row) { - const int64_t dst_y = row % p.dst_h; - const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; - for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { - float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; - const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; - const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + const int64_t circular = p.circular; -#ifdef GGML_SIMD - // Vectorized loop - for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { - GGML_F32_VEC sum = GGML_F32_VEC_ZERO; - for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = src_y_base + knl_y * p.dilation_y; - if (src_y < 0 || src_y >= p.src_h) { - continue; + if (circular == 0) { + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + + #ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = GGML_F32_VEC_FMA(sum, k, s); + } } - for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = src_x_base + knl_x * p.dilation_x; - if (src_x < 0 || src_x >= p.src_w) { + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } + #endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { continue; } - GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); - GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); - sum = GGML_F32_VEC_FMA(sum, k, s); + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; + } } + dst_data[c_i] = sum; } - GGML_F32_VEC_STORE(dst_data + c_i, sum); } -#endif - // Scalar loop - for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { - float sum = 0.0f; - for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = src_y_base + knl_y * p.dilation_y; - if (src_y < 0 || src_y >= p.src_h) { - continue; + } + } + else { + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const 
float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + + #ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = ggml_wrap_coord(src_y_base + knl_y * p.dilation_y, p.src_h); + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = ggml_wrap_coord(src_x_base + knl_x * p.dilation_x, p.src_w); + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = GGML_F32_VEC_FMA(sum, k, s); + } } - for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = src_x_base + knl_x * p.dilation_x; - if (src_x < 0 || src_x >= p.src_w) { - continue; + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } + #endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = ggml_wrap_coord(src_y_base + knl_y * p.dilation_y, p.src_h); + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = ggml_wrap_coord(src_x_base + knl_x * p.dilation_x, p.src_w); + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; } - sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] - * src_data[(src_y * p.src_w + src_x) * c + c_i]; } + dst_data[c_i] = sum; } - dst_data[c_i] = sum; } } } + } static void ggml_compute_forward_conv_2d_dw_whcn( @@ -7156,30 +7241,57 @@ static void ggml_compute_forward_conv_2d_dw_whcn( const int64_t start = params->ith * per_thread; const int64_t end = MIN(start + per_thread, n); - for (int64_t i = start; i < end; ++i) { - const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; - const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; - float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + const int64_t circular = p.circular; - for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { - for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + if (circular == 0) { + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; - float sum = 0.0f; - for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y < 0 || src_y >= p.src_h) { - continue; - } - for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x < 0 || src_x >= p.src_w) { + for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y < 0 || src_y >= p.src_h) { continue; } - sum += knl_data[knl_y * p.knl_w + knl_x] - * src_data[src_y * p.src_w + src_x]; + for (int64_t knl_x = 0; knl_x < 
p.knl_w; ++knl_x) { + const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } } + dst_data[dst_y * p.dst_w + dst_x] = sum; + } + } + } + } + else { + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + + for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = ggml_wrap_coord(dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y, p.src_h); + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = ggml_wrap_coord(dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x, p.src_w); + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } + } + dst_data[dst_y * p.dst_w + dst_x] = sum; } - dst_data[dst_y * p.dst_w + dst_x] = sum; } } } @@ -7206,6 +7318,7 @@ void ggml_compute_forward_conv_2d_dw( p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = dst->op_params[6]; GGML_ASSERT(kernel->ne[3] == p.channels); GGML_ASSERT(dst->ne[3] == p.batch); @@ -7626,24 +7739,51 @@ static void ggml_compute_forward_pad_f32( const int32_t rp2 = ggml_get_op_params_i32(dst, 5); const int32_t lp3 = ggml_get_op_params_i32(dst, 6); const int32_t rp3 = ggml_get_op_params_i32(dst, 7); + const int32_t circular = ggml_get_op_params_i32(dst, 8); // TODO: optimize - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) \ - && (i1 >= lp1 && i1 < ne1 - rp1) \ - && (i2 >= lp2 && i2 < ne2 - rp2) \ - && (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + if (circular == 0) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + if ((i0 >= lp0 && i0 < ne0 - rp0) \ + && (i1 >= lp1 && i1 < ne1 - rp1) \ + && (i2 >= lp2 && i2 < ne2 - rp2) \ + && (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + const float * src_ptr = (const float *)((char *) src0->data + src_idx); + dst_ptr[dst_idx] = *src_ptr; + } else { + dst_ptr[dst_idx] = 0; + } + } + } + } + } + } + else { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + const int64_t src_i0 = ggml_wrap_coord(i0 - lp0, ne00); + const int64_t src_i1 = ggml_wrap_coord(i1 - lp1, ne01); + const int64_t src_i2 = ggml_wrap_coord(i2 - lp2, ne02); + const int64_t src_i3 = ggml_wrap_coord(i3 - lp3, ne03); + + const int64_t src_idx = + src_i3*nb03 + + src_i2*nb02 + + src_i1*nb01 + + src_i0*nb00; + const float * src_ptr 
= (const float *)((char *) src0->data + src_idx); dst_ptr[dst_idx] = *src_ptr; - } else { - dst_ptr[dst_idx] = 0; } } } From 1b62b49aa82bf1e2eb69a43f77881ab6b3d6b5f9 Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 14:34:56 -0800 Subject: [PATCH 3/5] Feat: Added cuda kernels --- ggml/src/ggml-cuda/conv2d-dw.cu | 71 ++++++++++++++++------ ggml/src/ggml-cuda/conv2d-transpose.cu | 82 ++++++++++++++++++-------- ggml/src/ggml-cuda/conv2d.cu | 60 ++++++++++++++----- ggml/src/ggml-cuda/pad.cu | 66 +++++++++++++++------ 4 files changed, 204 insertions(+), 75 deletions(-) diff --git a/ggml/src/ggml-cuda/conv2d-dw.cu b/ggml/src/ggml-cuda/conv2d-dw.cu index 7583233b1b7cd..255f131a6ae2d 100644 --- a/ggml/src/ggml-cuda/conv2d-dw.cu +++ b/ggml/src/ggml-cuda/conv2d-dw.cu @@ -8,6 +8,7 @@ struct conv_params { int padding_x, padding_y; int dilation_x, dilation_y; int channels, batches; + int circular; }; struct kernel_bounds { @@ -17,14 +18,23 @@ struct kernel_bounds { __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) { kernel_bounds bounds; - bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.y_max = - min(params.kernel_h, - (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); - bounds.x_max = - min(params.kernel_w, - (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + if (params.circular) { + bounds.y_min = 0; + bounds.y_max = params.kernel_h; + bounds.x_min = 0; + bounds.x_max = params.kernel_w; + } + else { + bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.y_max = + min(params.kernel_h, + (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + bounds.x_max = + min(params.kernel_w, + (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + + } return bounds; } @@ -32,6 +42,10 @@ __device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coo return out_coord * stride + kern_coord * dilation - padding; } +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord % size + size) % size; +} + struct whcn_layout { __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) { return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x; @@ -83,7 +97,8 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr const int in_w, const int in_h, const int out_w, const int out_h, const int kernel_w, const int kernel_h, const int stride_x, const int stride_y, const int padding_x, const int padding_y, const int dilation_x, const int dilation_y, - const int channels, const int batches) { + const int channels, const int batches, + const int circular) { const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = batches * channels * out_h * out_w; @@ -92,7 +107,7 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr } conv_params params = { in_w, in_h, out_w, out_h, kernel_w, 
kernel_h,  stride_x,
-                            stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches };
+                            stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches, circular };
 
     int batch_idx, channel_idx, out_y_idx, out_x_idx;
     Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx);
@@ -100,18 +115,35 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr
     T accumulator = 0;
     kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params);
 
-    for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
-        int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
+    if (params.circular == 0) {
+        for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
+            int src_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
+
+            for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
+                int src_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
+
+                const T input_val = input[Layout::input_index(batch_idx, channel_idx, src_y_idx, src_x_idx, params)];
+                const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
+
+                accumulator += input_val * kernel_val;
+            }
+        }
+    }
+    else {
+        for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
+            int in_y_idx = wrap_coord(calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y), params.in_h);
 
-        for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
-            int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
+            for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
+                int in_x_idx = wrap_coord(calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x), params.in_w);
 
-            const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
-            const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
+                const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
+                const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
 
-            accumulator += input_val * kernel_val;
+                accumulator += input_val * kernel_val;
+            }
         }
     }
 
+
     output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
 }
@@ -132,6 +164,7 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     const int padding_y = p[3];
     const int dilation_x = p[4];
     const int dilation_y = p[5];
+    const int circular = p[6];
 
     const int in_w = input->ne[0];
     const int in_h = input->ne[1];
@@ -150,11 +183,11 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     if (ggml_is_contiguous(input)) {
         conv2d_dw_kernel<<>>(
             x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
-            dilation_x, dilation_y, channels, batches);
+            dilation_x, dilation_y, channels, batches, circular);
     } else if (ggml_is_contiguous_channels(input)) {
         conv2d_dw_kernel<<>>(
             x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
-            dilation_x, dilation_y, channels, batches);
+            dilation_x, dilation_y, channels, batches, circular);
     } else {
         GGML_ABORT("Unsupported memory layout for conv_2d_dw");
     }
 }
diff --git 
a/ggml/src/ggml-cuda/conv2d-transpose.cu b/ggml/src/ggml-cuda/conv2d-transpose.cu index 03224e404d32d..253f1dc3613de 100644 --- a/ggml/src/ggml-cuda/conv2d-transpose.cu +++ b/ggml/src/ggml-cuda/conv2d-transpose.cu @@ -3,10 +3,16 @@ #include "conv2d-transpose.cuh" #include "ggml.h" + +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord % size + size) % size; +} + __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel, float * __restrict__ output, const int in_w, const int in_h, const int out_w, const int out_h, const int kernel_w, const int kernel_h, const int stride, - const int c_in, const int c_out, const int batches) { + const int c_in, const int c_out, const int batches, + const int circular) { const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = out_w * out_h * c_out * batches; @@ -22,28 +28,55 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const float accumulator = 0; // For each output idx, find the inputs that contribute to it by checking stride alignment and bounds - - for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { - for (int kh = 0; kh < kernel_h; ++kh) { - int in_y = out_y_idx - kh; - if (in_y < 0 || in_y % stride) continue; - in_y /= stride; - if (in_y >= in_h) continue; - - for (int kw = 0; kw < kernel_w; ++kw) { - int in_x = out_x_idx - kw; - if (in_x < 0 || in_x % stride) continue; - in_x /= stride; - if (in_x >= in_w) continue; - - const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; - const int kernel_idx = - (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; - - float input_val = input[input_idx]; - half kern_val = kernel[kernel_idx]; - - accumulator += input_val * (float) kern_val; + if (circular == 0) { + for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { + for (int kh = 0; kh < kernel_h; ++kh) { + int in_y = out_y_idx - kh; + if (in_y < 0 || in_y % stride) continue; + in_y /= stride; + if (in_y >= in_h) continue; + + for (int kw = 0; kw < kernel_w; ++kw) { + int in_x = out_x_idx - kw; + if (in_x < 0 || in_x % stride) continue; + in_x /= stride; + if (in_x >= in_w) continue; + + const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; + const int kernel_idx = + (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; + + float input_val = input[input_idx]; + half kern_val = kernel[kernel_idx]; + + accumulator += input_val * (float) kern_val; + } + } + } + } + else { + for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { + for (int kh = 0; kh < kernel_h; ++kh) { + int in_y = out_y_idx - kh; + if (in_y % stride) continue; + in_y /= stride; + in_y = wrap_coord(in_y, in_h); + + for (int kw = 0; kw < kernel_w; ++kw) { + int in_x = out_x_idx - kw; + if (in_x % stride) continue; + in_x /= stride; + in_x = wrap_coord(in_x, in_w); + + const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; + const int kernel_idx = + (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; + + float input_val = input[input_idx]; + half kern_val = kernel[kernel_idx]; + + accumulator += input_val * (float) kern_val; + } } } } @@ -72,6 +105,7 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor const int kernel_h = kernel->ne[1]; const int stride = 
dst->op_params[0]; const int batches = input->ne[3]; + const int circular = dst->op_params[1]; GGML_ASSERT(channels_in == kernel->ne[3]); GGML_ASSERT(stride > 0); @@ -87,5 +121,5 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor conv2d_transpose_kernel<<>>( input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride, - channels_in, channels_out, batches); + channels_in, channels_out, batches, circular); } diff --git a/ggml/src/ggml-cuda/conv2d.cu b/ggml/src/ggml-cuda/conv2d.cu index 142dd66903aaa..e9d78df2c88bd 100644 --- a/ggml/src/ggml-cuda/conv2d.cu +++ b/ggml/src/ggml-cuda/conv2d.cu @@ -11,6 +11,7 @@ struct conv_params { const int64_t IC, OC; const int64_t B; const int64_t TOTAL; + const int64_t CIRCULAR; }; struct kernel_bounds { @@ -26,12 +27,24 @@ __device__ __forceinline__ int64_t min64(int64_t a, int64_t b) { return (a < b) ? a : b; } +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord % size + size) % size; +} + __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv_params & P) { kernel_bounds bounds; - bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); - bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + if (P.CIRCULAR) { + bounds.y_min = 0; + bounds.y_max = P.KH; + bounds.x_min = 0; + bounds.x_max = P.KW; + } + else { + bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + } return bounds; } @@ -84,19 +97,37 @@ static __global__ void conv2d_kernel(const float * __restrict__ input, Layout::unpack_indices(global_idx, P, n, c_out, out_y, out_x); float acc = 0.0f; + if (P.CIRCULAR == 0) { + for (int64_t c_in = 0; c_in < P.IC; ++c_in) { + kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); + + for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { + const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); - for (int64_t c_in = 0; c_in < P.IC; ++c_in) { - kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); + for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { + const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + + const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; + const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; + acc += (input_val * ggml_cuda_cast(kernel_val)); + } + } + } + } + else { + for (int64_t c_in = 0; c_in < P.IC; ++c_in) { + kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); - for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { - const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { + const int64_t in_y = wrap_coord(calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y), P.IH); - for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { - const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { + const int64_t 
in_x = wrap_coord(calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X), P.IW); - const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; - const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; - acc += (input_val * ggml_cuda_cast(kernel_val)); + const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; + const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; + acc += (input_val * ggml_cuda_cast(kernel_val)); + } } } } @@ -141,6 +172,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int PD_Y = p[3]; // padding_y const int DL_X = p[4]; // dilation_x const int DL_Y = p[5]; // dilation_y + const int CIRCULAR = p[6]; // No cwhn GGML_ASSERT(p[6] == false); @@ -156,7 +188,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int B = input->ne[3]; // n_batches const int64_t total = B * OC * OH * OW; - conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total }; + conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total, CIRCULAR }; if (kernel->type == GGML_TYPE_F16) { conv2d_cuda_f16(X_D, (half *) K_D, Y_D, params, st); diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index 29aef33c1a4b8..f3f06897e4299 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -1,9 +1,18 @@ +#include + #include "pad.cuh" + + +__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) { + return (coord % size + size) % size; +} + static __global__ void pad_f32(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int circular) { // blockIdx.z: i3*ne2+i2 // blockIdx.y: i1 // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE @@ -12,39 +21,59 @@ static __global__ void pad_f32(const float * src, float * dst, int i1 = blockIdx.y; int i2 = blockIdx.z % ne2; int i3 = blockIdx.z / ne2; + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { return; } - - // operation + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) && - (i1 >= lp1 && i1 < ne1 - rp1) && - (i2 >= lp2 && i2 < ne2 - rp2) && - (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t i00 = i0 - lp0; - const int64_t i01 = i1 - lp1; - const int64_t i02 = i2 - lp2; - const int64_t i03 = i3 - lp3; - const int64_t ne02 = ne2 - lp2 - rp2; - const int64_t ne01 = ne1 - lp1 - rp1; + + if (circular == 0) { + // operation + if ((i0 >= lp0 && i0 < ne0 - rp0) && + (i1 >= lp1 && i1 < ne1 - rp1) && + (i2 >= lp2 && i2 < ne2 - rp2) && + (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t i00 = i0 - lp0; + const int64_t i01 = i1 - lp1; + const int64_t i02 = i2 - lp2; + const int64_t i03 = i3 - lp3; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne00 = ne0 - lp0 - rp0; + + const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; + + dst[dst_idx] = src[src_idx]; + } else { + dst[dst_idx] = 0.0f; + } + } + else { const int64_t ne00 = ne0 - lp0 - rp0; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne03 = ne3 - lp3 - rp3; + + const int64_t i00 = wrap_coord(i0 - lp0, ne00); + const int64_t i01 = wrap_coord(i1 - 
lp1, ne01); + const int64_t i02 = wrap_coord(i2 - lp2, ne02); + const int64_t i03 = wrap_coord(i3 - lp3, ne03); const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; dst[dst_idx] = src[src_idx]; - } else { - dst[dst_idx] = 0.0f; } } static void pad_f32_cuda(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int circular, cudaStream_t stream) { int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; dim3 gridDim(num_blocks, ne1, ne2*ne3); - pad_f32<<>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3); + pad_f32<<>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3, circular); } void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -65,8 +94,9 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int32_t rp2 = ((const int32_t*)(dst->op_params))[5]; const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; const int32_t rp3 = ((const int32_t*)(dst->op_params))[7]; + const int32_t circular = ((const int32_t*)(dst->op_params))[8]; pad_f32_cuda(src0_d, dst_d, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], circular, stream); } From 60bed3b95c653fc8fdebd5c429f8d5d50a37bf0d Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 15:03:52 -0800 Subject: [PATCH 4/5] Added tests --- tests/test-backend-ops.cpp | 463 +++++++++++++++++++++++++++++++++++-- 1 file changed, 444 insertions(+), 19 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 967a53c63d86d..4e0546cb4d8f4 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4454,6 +4454,7 @@ struct test_conv_2d : public test_case { const int dilation1; // Whether the inputs are contiguous in the channel dim or the width dim const bool cwhn; + const bool circular; // If true, the direct CONV_2D will be used in the graph, otherwise it // uses ggml_conv_2d: @@ -4463,7 +4464,7 @@ struct test_conv_2d : public test_case { // IM2COL -> MUL_MM graph will be built. 
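    // Editor's illustrative note (not part of the patch): with the extra `circular`
    // constructor argument introduced in this struct, a toroidal variant of an
    // existing case could be registered in make_test_cases_eval() along the lines of
    //
    //     test_cases.emplace_back(new test_conv_2d({16, 16, 4, 1}, {3, 3, 4, 8},
    //                                              GGML_TYPE_F32, 1, 1, 1, 1, 1, 1,
    //                                              /*cwhn=*/false, /*circular=*/true));
    //
    // The shapes above are hypothetical; only the argument order is taken from the
    // constructor shown in this diff.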
std::string vars() override { - return VARS_TO_STR10(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn); + return VARS_TO_STR11(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn, circular); } double max_nmse_err() override { @@ -4499,7 +4500,8 @@ struct test_conv_2d : public test_case { test_conv_2d(std::array ne_input = { 64, 64, 16, 1 }, std::array ne_kernel = { 3, 3, 1, 16 }, ggml_type type_kernel = GGML_TYPE_F32, int stride0 = 1, - int stride1 = 1, int padding0 = 0, int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) : + int stride1 = 1, int padding0 = 0, int padding1 = 0, int dilation0 = 1, int dilation1 = 1, + bool cwhn = false, bool circular = false) : ne_input(ne_input), ne_kernel(ne_kernel), type_kernel(type_kernel), @@ -4509,7 +4511,8 @@ struct test_conv_2d : public test_case { padding1(padding1), dilation0(dilation0), dilation1(dilation1), - cwhn(cwhn) {} + cwhn(cwhn), + circular(circular) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); @@ -4527,8 +4530,58 @@ struct test_conv_2d : public test_case { kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1); } - ggml_tensor * out = - ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_tensor * out = circular + ? ggml_conv_2d_direct_circular(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1) + : ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_set_name(out, "out"); + return out; + } +}; + +struct test_conv_2d_im2col : public test_case { + const std::array ne_input; + const std::array ne_kernel; + const ggml_type type_kernel; + const int stride0; + const int stride1; + const int padding0; + const int padding1; + const int dilation0; + const int dilation1; + const bool circular; + + std::string vars() override { + return VARS_TO_STR10(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, circular); + } + + test_conv_2d_im2col(std::array ne_input = { 32, 24, 8, 2 }, + std::array ne_kernel = { 3, 3, 8, 4 }, + ggml_type type_kernel = GGML_TYPE_F32, + int stride0 = 1, int stride1 = 1, + int padding0 = 0, int padding1 = 0, + int dilation0 = 1, int dilation1 = 1, + bool circular = false) + : ne_input(ne_input), + ne_kernel(ne_kernel), + type_kernel(type_kernel), + stride0(stride0), + stride1(stride1), + padding0(padding0), + padding1(padding1), + dilation0(dilation0), + dilation1(dilation1), + circular(circular) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + ggml_set_name(input, "input"); + + ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data()); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * out = circular + ? 
ggml_conv_2d_circular(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1) + : ggml_conv_2d(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); ggml_set_name(out, "out"); return out; } @@ -4542,15 +4595,16 @@ struct test_conv_2d_dw : public test_case { const int padding; const int dilation; const bool cwhn; + const bool circular; std::string vars() override { - return VARS_TO_STR6(ne_input, ne_kernel, stride, padding, dilation, cwhn); + return VARS_TO_STR7(ne_input, ne_kernel, stride, padding, dilation, cwhn, circular); } test_conv_2d_dw(std::array ne_input = {64, 64, 16, 1}, std::array ne_kernel = {3, 3, 1, 16}, - int stride = 1, int padding = 0, int dilation = 1, bool cwhn = false) - : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), padding(padding), dilation(dilation), cwhn(cwhn) {} + int stride = 1, int padding = 0, int dilation = 1, bool cwhn = false, bool circular = false) + : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), padding(padding), dilation(dilation), cwhn(cwhn), circular(circular) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); @@ -4568,14 +4622,270 @@ struct test_conv_2d_dw : public test_case { kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1); } - ggml_tensor * out = ggml_conv_2d_dw_direct( - ctx, kernel, input, - stride, stride, padding, padding, dilation, dilation); + ggml_tensor * out = circular + ? ggml_conv_2d_dw_direct_circular(ctx, kernel, input, + stride, stride, padding, padding, dilation, dilation) + : ggml_conv_2d_dw_direct(ctx, kernel, input, + stride, stride, padding, padding, dilation, dilation); ggml_set_name(out, "out"); return out; } }; +struct test_conv_2d_direct_circular_manual : public test_case { + const std::array ne_input{5, 4, 1, 1}; + const std::array ne_kernel{3, 3, 1, 1}; + const int stride0 = 1; + const int stride1 = 1; + const int padding0 = 2; + const int padding1 = 1; + const int dilation0 = 1; + const int dilation1 = 1; + + ggml_tensor * input = nullptr; + ggml_tensor * kernel = nullptr; + ggml_tensor * expected = nullptr; + + std::string vars() override { + return "manual_conv2d_direct_circular"; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(input, "input"); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * actual = ggml_conv_2d_direct_circular( + ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_set_name(actual, "actual"); + + int64_t ne_out[4] = { + conv_out_size(ne_input[0], ne_kernel[0], stride0, padding0, dilation0), + conv_out_size(ne_input[1], ne_kernel[1], stride1, padding1, dilation1), + ne_kernel[3], + ne_input[3], + }; + + expected = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_out); + ggml_set_name(expected, "expected"); + + ggml_tensor * diff = ggml_sub(ctx, actual, expected); + ggml_tensor * sq = ggml_sqr(ctx, diff); + ggml_tensor * loss = ggml_sum(ctx, sq); + ggml_set_name(loss, "loss"); + return loss; + } + + void initialize_tensors(ggml_context * ctx) override { + test_case::initialize_tensors(ctx); + + std::vector input_data(ggml_nelements(input)); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(std::sin(static_cast(i + 1))); + } + ggml_backend_tensor_set(input, input_data.data(), 0, input_data.size() * sizeof(float)); + + 
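+        // Deterministic, non-trivial kernel values (cos of the flat index); the
+        // expected output computed below is a plain CPU reference convolution
+        // that wraps input coordinates with wrap_coord_circular(), i.e. the
+        // circular-padding behaviour under test.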
std::vector kernel_data(ggml_nelements(kernel)); + for (size_t i = 0; i < kernel_data.size(); ++i) { + kernel_data[i] = static_cast(std::cos(static_cast(i + 1))); + } + ggml_backend_tensor_set(kernel, kernel_data.data(), 0, kernel_data.size() * sizeof(float)); + + int64_t ne_out[4] = { + conv_out_size(ne_input[0], ne_kernel[0], stride0, padding0, dilation0), + conv_out_size(ne_input[1], ne_kernel[1], stride1, padding1, dilation1), + ne_kernel[3], + ne_input[3], + }; + std::vector expected_data(ggml_nelements(expected), 0.0f); + + for (int64_t n = 0; n < ne_input[3]; ++n) { + for (int64_t oc = 0; oc < ne_kernel[3]; ++oc) { + for (int64_t oy = 0; oy < ne_out[1]; ++oy) { + for (int64_t ox = 0; ox < ne_out[0]; ++ox) { + float sum = 0.0f; + for (int64_t ic = 0; ic < ne_kernel[2]; ++ic) { + for (int64_t ky = 0; ky < ne_kernel[1]; ++ky) { + const int64_t in_y = wrap_coord_circular( + oy * stride1 + ky * dilation1 - padding1, ne_input[1]); + for (int64_t kx = 0; kx < ne_kernel[0]; ++kx) { + const int64_t in_x = wrap_coord_circular( + ox * stride0 + kx * dilation0 - padding0, ne_input[0]); + const int64_t src_idx = offset4d(ne_input.data(), in_x, in_y, ic, n); + const int64_t ker_idx = offset4d(ne_kernel.data(), kx, ky, ic, oc); + sum += input_data[src_idx] * kernel_data[ker_idx]; + } + } + } + expected_data[offset4d(ne_out, ox, oy, oc, n)] = sum; + } + } + } + } + + ggml_backend_tensor_set(expected, expected_data.data(), 0, expected_data.size() * sizeof(float)); + } + + double max_nmse_err() override { + return 1e-8; + } +}; + +struct test_conv_2d_dw_direct_circular_manual : public test_case { + const std::array ne_input{4, 3, 2, 1}; + const std::array ne_kernel{3, 2, 1, 2}; + const int stride = 1; + const int padding = 1; + const int dilation = 1; + + ggml_tensor * input = nullptr; + ggml_tensor * kernel = nullptr; + ggml_tensor * expected = nullptr; + + std::string vars() override { + return "manual_conv2d_dw_direct_circular"; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(input, "input"); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * actual = ggml_conv_2d_dw_direct_circular( + ctx, kernel, input, stride, stride, padding, padding, dilation, dilation); + ggml_set_name(actual, "actual"); + + int64_t ne_out[4] = { + conv_out_size(ne_input[0], ne_kernel[0], stride, padding, dilation), + conv_out_size(ne_input[1], ne_kernel[1], stride, padding, dilation), + ne_input[2], + ne_input[3], + }; + expected = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_out); + ggml_set_name(expected, "expected"); + + ggml_tensor * diff = ggml_sub(ctx, actual, expected); + ggml_tensor * sq = ggml_sqr(ctx, diff); + ggml_tensor * loss = ggml_sum(ctx, sq); + ggml_set_name(loss, "loss"); + return loss; + } + + void initialize_tensors(ggml_context * ctx) override { + test_case::initialize_tensors(ctx); + + std::vector input_data(ggml_nelements(input)); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i % 7); + } + ggml_backend_tensor_set(input, input_data.data(), 0, input_data.size() * sizeof(float)); + + std::vector kernel_data(ggml_nelements(kernel)); + for (size_t i = 0; i < kernel_data.size(); ++i) { + kernel_data[i] = static_cast((i % 5) - 2); + } + ggml_backend_tensor_set(kernel, kernel_data.data(), 0, kernel_data.size() * sizeof(float)); + + int64_t ne_out[4] = { + conv_out_size(ne_input[0], ne_kernel[0], stride, 
padding, dilation), + conv_out_size(ne_input[1], ne_kernel[1], stride, padding, dilation), + ne_input[2], + ne_input[3], + }; + + std::vector expected_data(ggml_nelements(expected), 0.0f); + for (int64_t n = 0; n < ne_input[3]; ++n) { + for (int64_t c = 0; c < ne_input[2]; ++c) { + for (int64_t oy = 0; oy < ne_out[1]; ++oy) { + for (int64_t ox = 0; ox < ne_out[0]; ++ox) { + float sum = 0.0f; + for (int64_t ky = 0; ky < ne_kernel[1]; ++ky) { + const int64_t in_y = wrap_coord_circular( + oy * stride + ky * dilation - padding, ne_input[1]); + for (int64_t kx = 0; kx < ne_kernel[0]; ++kx) { + const int64_t in_x = wrap_coord_circular( + ox * stride + kx * dilation - padding, ne_input[0]); + const int64_t src_idx = offset4d(ne_input.data(), in_x, in_y, c, n); + const int64_t ker_idx = offset4d(ne_kernel.data(), kx, ky, 0, c); + sum += input_data[src_idx] * kernel_data[ker_idx]; + } + } + expected_data[offset4d(ne_out, ox, oy, c, n)] = sum; + } + } + } + } + + ggml_backend_tensor_set(expected, expected_data.data(), 0, expected_data.size() * sizeof(float)); + } + + double max_nmse_err() override { + return 1e-8; + } +}; + +struct test_conv_2d_circular_pipeline : public test_case { + const std::array ne_input{6, 5, 3, 2}; + const std::array ne_kernel{3, 3, 3, 4}; + const int stride0 = 2; + const int stride1 = 1; + const int padding0 = 1; + const int padding1 = 2; + const int dilation0 = 1; + const int dilation1 = 2; + + ggml_tensor * input = nullptr; + ggml_tensor * kernel = nullptr; + + std::string vars() override { + return "conv2d_circular_vs_pipeline"; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(input, "input"); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * actual = ggml_conv_2d_circular( + ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_set_name(actual, "actual"); + + ggml_tensor * padded = ggml_pad_ext_circular(ctx, input, padding0, padding0, padding1, padding1, 0, 0, 0, 0); + ggml_set_name(padded, "padded"); + ggml_tensor * reference = ggml_conv_2d(ctx, kernel, padded, stride0, stride1, 0, 0, dilation0, dilation1); + ggml_set_name(reference, "reference"); + + ggml_tensor * diff = ggml_sub(ctx, actual, reference); + ggml_tensor * sq = ggml_sqr(ctx, diff); + ggml_tensor * loss = ggml_sum(ctx, sq); + ggml_set_name(loss, "loss"); + return loss; + } + + void initialize_tensors(ggml_context * ctx) override { + test_case::initialize_tensors(ctx); + + std::vector input_data(ggml_nelements(input)); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(std::fmod(static_cast(i * 3 + 1), 17.0)); + } + ggml_backend_tensor_set(input, input_data.data(), 0, input_data.size() * sizeof(float)); + + std::vector kernel_data(ggml_nelements(kernel)); + for (size_t i = 0; i < kernel_data.size(); ++i) { + kernel_data[i] = static_cast(std::fmod(static_cast(i * 7 + 3), 11.0) - 5.0); + } + ggml_backend_tensor_set(kernel, kernel_data.data(), 0, kernel_data.size() * sizeof(float)); + } + + double max_nmse_err() override { + return 1e-8; + } +}; + // GGML_OP_CONV_3D struct test_conv_3d : public test_case { // Logical 5D dimensions @@ -5288,26 +5598,43 @@ struct test_acc : public test_case { }; // GGML_OP_PAD +static inline int64_t wrap_coord_circular(int64_t coord, int64_t size) { + GGML_ASSERT(size > 0); + const int64_t mod = coord % size; + return mod < 0 ? 
mod + size : mod; +} + +static inline int64_t offset4d(const int64_t ne[4], int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + return ((i3 * ne[2] + i2) * ne[1] + i1) * ne[0] + i0; +} + +static inline int64_t conv_out_size(int64_t ins, int64_t ks, int stride, int pad, int dilation) { + return (ins + 2 * pad - dilation * (ks - 1) - 1) / stride + 1; +} + struct test_pad : public test_case { const ggml_type type; const std::array ne_a; const int pad_0; const int pad_1; + const bool circular; std::string vars() override { - return VARS_TO_STR4(type, ne_a, pad_0, pad_1); + return VARS_TO_STR5(type, ne_a, pad_0, pad_1, circular); } test_pad(ggml_type type = GGML_TYPE_F32, std::array ne_a = {512, 512, 1, 1}, - int pad_0 = 1, int pad_1 = 1) - : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {} + int pad_0 = 1, int pad_1 = 1, bool circular = false) + : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1), circular(circular) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); ggml_set_name(a, "a"); - ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0); + ggml_tensor * out = circular + ? ggml_pad_circular(ctx, a, pad_0, pad_1, 0, 0) + : ggml_pad(ctx, a, pad_0, pad_1, 0, 0); ggml_set_name(out, "out"); return out; @@ -5326,17 +5653,19 @@ struct test_pad_ext : public test_case { const int lp3; const int rp3; const bool v; + const bool circular; std::string vars() override { - return VARS_TO_STR11(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v); + return VARS_TO_STR12(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v, circular); } test_pad_ext(ggml_type type = GGML_TYPE_F32, std::array ne_a = {512, 512, 3, 1}, int lp0 = 1, int rp0 = 1, int lp1 = 1, int rp1 = 1, int lp2 = 1, int rp2 = 1, int lp3 = 1, int rp3 = 1, - bool v = false) - : type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3), v(v) {} + bool v = false, bool circular = false) + : type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3), + v(v), circular(circular) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); @@ -5347,7 +5676,9 @@ struct test_pad_ext : public test_case { ggml_set_name(a, "view of a"); } - ggml_tensor * out = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + ggml_tensor * out = circular + ? 
ggml_pad_ext_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + : ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); ggml_set_name(out, "out"); return out; @@ -5355,6 +5686,83 @@ struct test_pad_ext : public test_case { }; // GGML_OP_PAD_REFLECT_1D + +struct test_pad_ext_circular_manual : public test_case { + const std::array ne_src{4, 3, 1, 1}; + const std::array pads_l{1, 2, 0, 0}; + const std::array pads_r{2, 1, 0, 0}; + + ggml_tensor * input = nullptr; + ggml_tensor * expected = nullptr; + + std::string vars() override { + return "manual_pad_ext_circular"; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_src.data()); + ggml_set_name(input, "input"); + + ggml_tensor * actual = ggml_pad_ext_circular(ctx, input, + pads_l[0], pads_r[0], pads_l[1], pads_r[1], pads_l[2], pads_r[2], pads_l[3], pads_r[3]); + ggml_set_name(actual, "actual"); + + int64_t ne_dst[4] = { + ne_src[0] + pads_l[0] + pads_r[0], + ne_src[1] + pads_l[1] + pads_r[1], + ne_src[2] + pads_l[2] + pads_r[2], + ne_src[3] + pads_l[3] + pads_r[3], + }; + + expected = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_dst); + ggml_set_name(expected, "expected"); + + ggml_tensor * diff = ggml_sub(ctx, actual, expected); + ggml_tensor * sq = ggml_sqr(ctx, diff); + ggml_tensor * loss = ggml_sum(ctx, sq); + ggml_set_name(loss, "loss"); + return loss; + } + + void initialize_tensors(ggml_context * ctx) override { + test_case::initialize_tensors(ctx); + + std::vector src_data(ggml_nelements(input)); + for (size_t i = 0; i < src_data.size(); ++i) { + src_data[i] = static_cast(i + 1); + } + ggml_backend_tensor_set(input, src_data.data(), 0, src_data.size() * sizeof(float)); + + int64_t ne_dst[4] = { + ne_src[0] + pads_l[0] + pads_r[0], + ne_src[1] + pads_l[1] + pads_r[1], + ne_src[2] + pads_l[2] + pads_r[2], + ne_src[3] + pads_l[3] + pads_r[3], + }; + + std::vector exp_data(ggml_nelements(expected)); + for (int64_t i3 = 0; i3 < ne_dst[3]; ++i3) { + for (int64_t i2 = 0; i2 < ne_dst[2]; ++i2) { + for (int64_t i1 = 0; i1 < ne_dst[1]; ++i1) { + for (int64_t i0 = 0; i0 < ne_dst[0]; ++i0) { + const int64_t src_i0 = wrap_coord_circular(i0 - pads_l[0], ne_src[0]); + const int64_t src_i1 = wrap_coord_circular(i1 - pads_l[1], ne_src[1]); + const int64_t src_i2 = wrap_coord_circular(i2 - pads_l[2], ne_src[2]); + const int64_t src_i3 = wrap_coord_circular(i3 - pads_l[3], ne_src[3]); + exp_data[offset4d(ne_dst, i0, i1, i2, i3)] = + src_data[offset4d(ne_src.data(), src_i0, src_i1, src_i2, src_i3)]; + } + } + } + } + ggml_backend_tensor_set(expected, exp_data.data(), 0, exp_data.size() * sizeof(float)); + } + + double max_nmse_err() override { + return 1e-8; + } +}; + struct test_pad_reflect_1d : public test_case { const ggml_type type; const std::array ne_a; @@ -6477,10 +6885,23 @@ static std::vector> make_test_cases_eval() { // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true)); // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_conv_2d({37, 29, 3, 2}, {3, 2, 3, 5}, GGML_TYPE_F32, 1, 1, 4, 3, 1, 1, false, true)); + test_cases.emplace_back(new test_conv_2d({19, 23, 4, 1}, {5, 3, 4, 4}, GGML_TYPE_F16, 2, 1, 3, 2, 1, 2, false, true)); + test_cases.emplace_back(new test_conv_2d({16, 18, 6, 3}, {3, 3, 6, 8}, GGML_TYPE_F32, 1, 2, 2, 3, 1, 1, true, 
true)); + + test_cases.emplace_back(new test_conv_2d_im2col()); + test_cases.emplace_back(new test_conv_2d_im2col({17, 13, 6, 2}, {3, 3, 6, 4}, GGML_TYPE_F32, 1, 2, 2, 3, 1, 1, true)); + test_cases.emplace_back(new test_conv_2d_im2col({11, 7, 2, 1}, {3, 3, 2, 3}, GGML_TYPE_F16, 1, 1, 1, 1, 2, 1, true)); + test_cases.emplace_back(new test_conv_2d_direct_circular_manual()); + test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, false)); test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, true)); test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false)); test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true)); + test_cases.emplace_back(new test_conv_2d_dw({29, 19, 8, 2}, {5, 3, 1, 8}, 1, 2, 1, false, true)); + test_cases.emplace_back(new test_conv_2d_dw({24, 14, 16, 1}, {3, 3, 1, 16}, 2, 1, 2, true, true)); + test_cases.emplace_back(new test_conv_2d_dw_direct_circular_manual()); + test_cases.emplace_back(new test_conv_2d_circular_pipeline()); // CONV_3D auto calc_conv_output_size_3d = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { @@ -7214,7 +7635,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {9, 9, 1280, 1})); test_cases.emplace_back(new test_acc()); test_cases.emplace_back(new test_pad()); + test_cases.emplace_back(new test_pad_ext_circular_manual()); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); test_cases.emplace_back(new test_pad_ext()); + test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {19, 11, 5, 2}, 2, 4, 1, 3, 0, 0, 0, 0, false, true)); test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); test_cases.emplace_back(new test_roll()); @@ -7225,6 +7649,7 @@ static std::vector> make_test_cases_eval() { for (bool v : {false, true}) { test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v)); test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {11, 22, 33, 44}, 1, 2, 3, 4, 5, 6, 7, 8, v)); + test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {23, 17, 7, 3}, 2, 1, 3, 0, 1, 2, 0, 0, v, true)); } for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 576 }) { From 5700a4e7e92200a3119d36a4719390c787cc1831 Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 15:37:57 -0800 Subject: [PATCH 5/5] Added tests --- tests/test-backend-ops.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 4e0546cb4d8f4..29b2a77b60e0a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4632,6 +4632,20 @@ struct test_conv_2d_dw : public test_case { } }; +static inline int64_t conv_out_size(int64_t ins, int64_t ks, int stride, int pad, int dilation) { + return (ins + 2 * pad - dilation * (ks - 1) - 1) / stride + 1; +} +// GGML_OP_PAD +static inline int64_t wrap_coord_circular(int64_t coord, int64_t size) { + GGML_ASSERT(size > 0); + const int64_t mod = coord % size; + return mod < 0 ? 
mod + size : mod;
+}
+
+static inline int64_t offset4d(const int64_t ne[4], int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    return ((i3 * ne[2] + i2) * ne[1] + i1) * ne[0] + i0;
+}
+
 struct test_conv_2d_direct_circular_manual : public test_case {
     const std::array<int64_t, 4> ne_input{5, 4, 1, 1};
     const std::array<int64_t, 4> ne_kernel{3, 3, 1, 1};
@@ -5597,20 +5611,7 @@ struct test_acc : public test_case {
     }
 };
 
-// GGML_OP_PAD
-static inline int64_t wrap_coord_circular(int64_t coord, int64_t size) {
-    GGML_ASSERT(size > 0);
-    const int64_t mod = coord % size;
-    return mod < 0 ? mod + size : mod;
-}
-
-static inline int64_t offset4d(const int64_t ne[4], int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-    return ((i3 * ne[2] + i2) * ne[1] + i1) * ne[0] + i0;
-}
-
-static inline int64_t conv_out_size(int64_t ins, int64_t ks, int stride, int pad, int dilation) {
-    return (ins + 2 * pad - dilation * (ks - 1) - 1) / stride + 1;
-}
-
 struct test_pad : public test_case {
     const ggml_type type;