From f6ac08424f490b70b94057deb28ab70e8ebc549c Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 13:27:40 -0800 Subject: [PATCH 1/5] Feat: Added vulkan circular tiling support --- ggml/include/ggml.h | 82 +++++++++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 12 ++ .../ggml-vulkan/vulkan-shaders/conv2d_dw.comp | 76 ++++++++--- .../ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 17 ++- ggml/src/ggml-vulkan/vulkan-shaders/pad.comp | 26 +++- ggml/src/ggml.c | 127 +++++++++++++++++- 6 files changed, 310 insertions(+), 30 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2311cdabe3ba4..bb96e1bb5dbb8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1943,6 +1943,18 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + + GGML_API struct ggml_tensor * ggml_conv_2d_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + GGML_API struct ggml_tensor * ggml_im2col_3d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2016,6 +2028,19 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + + // depthwise (via im2col and mul_mat) + GGML_API struct ggml_tensor * ggml_conv_2d_dw_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + // Depthwise 2D convolution // may be faster than ggml_conv_2d_dw, but not available in all backends // a: KW KH 1 C convolution kernel @@ -2032,12 +2057,35 @@ extern "C" { int dilation0, int dilation1); + // Depthwise 2D convolution (on a torus) + // may be faster than ggml_conv_2d_dw, but not available in all backends + // a: KW KH 1 C convolution kernel + // b: W H C N input data + // res: W_out H_out C N + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int stride); + // circular (on a torus) + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride); + GGML_API struct ggml_tensor * ggml_conv_2d_direct( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] @@ -2048,6 +2096,17 @@ extern "C" { int p1, // padding dimension 1 int d0, // dilation dimension 0 int d1); // dilation dimension 1 + + GGML_API struct ggml_tensor * ggml_conv_2d_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 GGML_API struct ggml_tensor * ggml_conv_3d_direct( struct ggml_context * ctx, @@ -2156,6 +2215,15 @@ extern "C" { int 
p2, int p3); + // pad each dimension with values on the other side of the torus (looping around) + GGML_API struct ggml_tensor * ggml_pad_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3); + GGML_API struct ggml_tensor * ggml_pad_ext( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2169,6 +2237,20 @@ extern "C" { int rp3 ); + // circular padding + GGML_API struct ggml_tensor * ggml_pad_ext_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3 + ); + // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] GGML_API struct ggml_tensor * ggml_pad_reflect_1d( struct ggml_context * ctx, diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 8d1a85c96939b..69b58e32950a5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -940,6 +940,7 @@ struct vk_op_pad_push_constants { uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; + uint32_t circular; uint32_t lp0; uint32_t rp0; uint32_t lp1; uint32_t rp1; @@ -982,6 +983,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor p.rp2 = dst->op_params[5]; p.lp3 = dst->op_params[6]; p.rp3 = dst->op_params[7]; + p.circular = dst->op_params[8]; return p; // fastdiv values and offsets are initialized later in ggml_vk_op } @@ -1249,6 +1251,8 @@ struct vk_op_conv2d_push_constants { uint32_t KWKHmp; uint32_t KWKHL; uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; + + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { @@ -1297,6 +1301,8 @@ struct vk_op_conv_transpose_2d_push_constants { uint32_t OWOHmp; uint32_t OWOHL; uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; + + uint32_t circular; }; template <> void init_pushconst_fastdiv(vk_op_conv_transpose_2d_push_constants &p) { @@ -1325,6 +1331,7 @@ struct vk_op_conv2d_dw_push_constants { int32_t pad_y; int32_t dilation_x; int32_t dilation_y; + uint32_t circular; }; struct vk_op_upscale_push_constants { @@ -10420,6 +10427,8 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = static_cast(dst->op_params[6]); + GGML_ASSERT(ne03 == ne2); GGML_ASSERT(ne02 == ne12); @@ -10469,6 +10478,8 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); + p.circular = static_cast(dst->op_params[1]); + GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne12); @@ -10492,6 +10503,7 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = dst->op_params[6]; GGML_ASSERT(src0->ne[3] == p.channels); GGML_ASSERT(src1->ne[3] == p.batches); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp index 70a301488eb1d..88bd1d7a755f6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp @@ -19,6 +19,7 @@ layout (push_constant) 
uniform parameter int pad_y; int dilation_x; int dilation_y; + uint circular; } p; layout (binding = 0) readonly buffer A {A_TYPE knl_data[];}; @@ -27,6 +28,10 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];}; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; +uint32_t wrap_coord(int coord, uint32_t size) { + return uint32_t((uint(coord + int(size))) % size); +} + FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint i0 = idx / p.dst_w; uint dst_x = idx - i0 * p.dst_w; @@ -39,19 +44,35 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) { uint knl_i = c * p.knl_h * p.knl_w; FLOAT_TYPE sum = 0.0; - for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; + + if (p.circular != 0u) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + } + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); - sum = fma(v, k, sum); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]); + sum = fma(v, k, sum); + } } } return sum; @@ -70,19 +91,34 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) { uint knl_row = p.knl_w * p.channels; FLOAT_TYPE sum = 0.0; - for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { - uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int - continue; + if (p.circular != 0u) { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + int raw_y = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y; + uint src_y = wrap_coord(raw_y, p.src_h); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + int raw_x = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x; + uint src_x = wrap_coord(raw_x, p.src_w); + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } - for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { - uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + } + else { + for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) { + uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if 
(src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int continue; } - FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); - FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); - sum = fma(v, k, sum); + for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) { + uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int + continue; + } + FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]); + FLOAT_TYPE k = FLOAT_TYPE(knl_data[ knl_y * knl_row + knl_x * p.channels + c]); + sum = fma(v, k, sum); + } } } return sum; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 0367e80bbfa73..c18fd92ccaf9a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -70,6 +70,8 @@ layout(push_constant) uniform parameter { uint32_t s0mp; uint32_t s0L; uint32_t s1mp; uint32_t s1L; #endif + + uint32_t circular; } p; @@ -174,6 +176,10 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T } #endif +uint32_t wrap_coord(int coord, uint32_t size) { + return uint32_t((uint(coord + int(size))) % size); +} + void main() { #ifdef COOPMAT2 coopmat matC; @@ -274,7 +280,8 @@ void main() { KH_idx_b = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW; KW_idx_b = CRS_remainder - KH_idx_b * p.KW; #endif - + uint32_t H_pos; + uint32_t W_pos; #ifdef TRANSPOSE uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1; uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0; @@ -284,13 +291,15 @@ void main() { uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; #endif + H_pos = (p.circular != 0) ? wrap_coord(int(H_idx), p.H) : H_idx; + W_pos = (p.circular != 0) ? wrap_coord(int(W_idx), p.W) : W_idx; uint32_t src_idx = - min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + min(max(W_pos + H_pos * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); float val = src_data[src_idx]; if (CRS_idx_b >= CRS || NPQ_idx >= NPQ - || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case) + || H_pos >= p.H || W_pos >= p.W // Lower bound checks aren't necessary. 
(idx >= 0x80000000 for such case) #ifdef TRANSPOSE - || (H_idx_x_s1 - H_idx * p.s1 != 0) || (W_idx_x_s0 - W_idx * p.s0 != 0) + || (H_idx_x_s1 - H_pos * p.s1 != 0) || (W_idx_x_s0 - W_pos * p.s0 != 0) #endif ) { val = 0.0; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp index f3c8176872758..f2fd5929bf41d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp @@ -8,6 +8,7 @@ layout (push_constant) uniform parameter uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint misalign_offsets; + uint circular; uint lp0; uint rp0; uint lp1; uint rp1; @@ -18,6 +19,10 @@ layout (push_constant) uniform parameter uint get_aoffset() { return p.misalign_offsets >> 16; } uint get_doffset() { return p.misalign_offsets & 0xFFFF; } +uint wrap_coord(int coord, uint size) { + return (uint(coord + int(size))) % size; // add size to avoid issues with negative +} + layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; @@ -40,10 +45,21 @@ void main() { const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00; const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; - const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 && - i1 >= p.lp1 && i1 < p.ne11 - p.rp1 && - i2 >= p.lp2 && i2 < p.ne12 - p.rp2 && - i3 >= p.lp3 && i3 < p.ne13 - p.rp3; + if (p.circular != 0u) { + const uint ci0 = wrap_coord(int(i0) - int(p.lp0), p.ne00); + const uint ci1 = wrap_coord(int(i1) - int(p.lp1), p.ne01); + const uint ci2 = wrap_coord(int(i2) - int(p.lp2), p.ne02); + const uint ci3 = wrap_coord(int(i3) - int(p.lp3), p.ne03); + const uint circular_src_idx = ci3*p.nb03 + ci2*p.nb02 + ci1*p.nb01 + ci0*p.nb00; + data_d[get_doffset() + dst_idx] = D_TYPE(data_a[get_aoffset() + circular_src_idx]); + } + else { + const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 && + i1 >= p.lp1 && i1 < p.ne11 - p.rp1 && + i2 >= p.lp2 && i2 < p.ne12 - p.rp2 && + i3 >= p.lp3 && i3 < p.ne13 - p.rp3; + data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f); + } + - data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? 
data_a[get_aoffset() + src0_idx] : 0.0f); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9be35c1be8456..1113164f48865 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4457,6 +4457,29 @@ struct ggml_tensor * ggml_conv_2d( return result; } + +// ggml_conv_2d_circular + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct ggml_tensor * ggml_conv_2d_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + if (p0 == 0 && p1 == 0) { + return ggml_conv_2d(ctx, a, b, s0, s1, p0, p1, d0, d1); + } + struct ggml_tensor * b_padded = ggml_pad_ext_circular(ctx, b, p0, p0, p1, p1, 0, 0, 0, 0); + return ggml_conv_2d(ctx, a, b_padded, s0, s1, 0, 0, d0, d1); +} + // a: [OC*IC, KD, KH, KW] // b: [N*IC, ID, IH, IW] // result: [N*OD, OH, OW, IC * KD * KH * KW] @@ -4585,6 +4608,25 @@ struct ggml_tensor * ggml_conv_2d_dw( return result; } +// ggml_conv_2d_dw_circular + +struct ggml_tensor * ggml_conv_2d_dw_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + if (p0 == 0 && p1 == 0) { + return ggml_conv_2d_dw(ctx, a, b, s0, s1, p0, p1, d0, d1); + } + struct ggml_tensor * b_padded = ggml_pad_ext_circular(ctx, b, p0, p0, p1, p1, 0, 0, 0, 0); + return ggml_conv_2d_dw(ctx, a, b_padded, s0, s1, 0, 0, d0, d1); +} + // ggml_conv_2d_dw_direct struct ggml_tensor * ggml_conv_2d_dw_direct( @@ -4616,7 +4658,9 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( result->nb[2] = type_size; } - int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 }; + int circular = 0; // default not circular + + int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1, circular }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_2D_DW; @@ -4625,6 +4669,24 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( return result; } +// ggml_conv_2d_dw_direct_circular + +struct ggml_tensor * ggml_conv_2d_dw_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1) { + struct ggml_tensor * result = + ggml_conv_2d_dw_direct(ctx, a, b, stride0, stride1, pad0, pad1, dilation0, dilation1); + ggml_set_op_params_i32(result, 6, 1); // circular + return result; +} + // ggml_conv_2d_direct struct ggml_tensor * ggml_conv_2d_direct( @@ -4655,6 +4717,7 @@ struct ggml_tensor * ggml_conv_2d_direct( ggml_set_op_params_i32(result, 3, p1); ggml_set_op_params_i32(result, 4, d0); ggml_set_op_params_i32(result, 5, d1); + ggml_set_op_params_i32(result, 6, 0); // default not circularc result->op = GGML_OP_CONV_2D; result->src[0] = a; @@ -4663,6 +4726,23 @@ struct ggml_tensor * ggml_conv_2d_direct( return result; } +// ggml_conv_2d_direct_circular + +struct ggml_tensor * ggml_conv_2d_direct_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_tensor * result = ggml_conv_2d_direct(ctx, a, b, s0, s1, p0, p1, d0, d1); + ggml_set_op_params_i32(result, 6, 1); // circular + return result; +} + // ggml_conv_3d_direct struct ggml_tensor * ggml_conv_3d_direct( @@ -4735,6 +4815,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); ggml_set_op_params_i32(result, 0, stride); + 
ggml_set_op_params_i32(result, 1, 0); // circular default off result->op = GGML_OP_CONV_TRANSPOSE_2D; result->src[0] = a; @@ -4743,6 +4824,18 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( return result; } +// ggml_conv_transpose_2d_p0_circular + +struct ggml_tensor * ggml_conv_transpose_2d_p0_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride) { + struct ggml_tensor * result = ggml_conv_transpose_2d_p0(ctx, a, b, stride); + ggml_set_op_params_i32(result, 1, 1); // circular enabled + return result; +} + // ggml_pool_* static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) { @@ -4894,6 +4987,18 @@ struct ggml_tensor * ggml_pad( return ggml_pad_ext(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); } +// ggml_pad_circular + +struct ggml_tensor * ggml_pad_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3) { + return ggml_pad_ext_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); +} + struct ggml_tensor * ggml_pad_ext( struct ggml_context * ctx, struct ggml_tensor * a, @@ -4920,6 +5025,7 @@ struct ggml_tensor * ggml_pad_ext( ggml_set_op_params_i32(result, 5, rp2); ggml_set_op_params_i32(result, 6, lp3); ggml_set_op_params_i32(result, 7, rp3); + ggml_set_op_params_i32(result, 8, 0); // not circular by default result->op = GGML_OP_PAD; @@ -4928,6 +5034,25 @@ struct ggml_tensor * ggml_pad_ext( return result; } +// ggml_pad_ext_circular + +struct ggml_tensor * ggml_pad_ext_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3 + ) { + struct ggml_tensor * result = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + ggml_set_op_params_i32(result, 8, 1); // circular + return result; +} + // ggml_pad_reflect_1d struct ggml_tensor * ggml_pad_reflect_1d( From d7f5958b9e3f3553e211e9cdab83f3c222c504fa Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 13:50:52 -0800 Subject: [PATCH 2/5] Feat: Added cpu circular --- ggml/src/ggml-cpu/ops.cpp | 324 +++++++++++++++++++++++++++----------- 1 file changed, 232 insertions(+), 92 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index f66d36ff62c03..92fd172769abc 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6674,6 +6674,10 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params ggml_compute_forward_mul_mat(params, &dst); } +static inline int64_t ggml_wrap_coord(int64_t coord, int64_t size) { + return (coord + size) % size; // adding size avoids negative number weirdness +} + // ggml_compute_forward_conv_2d static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params, @@ -6694,6 +6698,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int32_t pad_y = dst->op_params[3]; const int32_t dilation_x = dst->op_params[4]; const int32_t dilation_y = dst->op_params[5]; + const bool circular = dst->op_params[6]; const int64_t c_in = src->ne[2]; const int64_t c_out = kernel->ne[3]; @@ -6734,40 +6739,73 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch); //im2col for a patch - for (int64_t p = patch_start; p < patch_end; ++p) { - const int64_t batch_n = p / (dst_w * dst_h); - const int64_t src_x = (p / dst_w) % dst_h; - const int64_t src_y = p % dst_w; + if (circular == 0) { + for 
(int64_t p = patch_start; p < patch_end; ++p) { + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t src_x = (p / dst_w) % dst_h; + const int64_t src_y = p % dst_w; - const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); - char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; + const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; - for (int64_t ic = 0; ic < c_in; ++ic) { - for (int64_t ky = 0; ky < knl_h; ++ky) { - for (int64_t kx = 0; kx < knl_w; ++kx) { - const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y; - const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x; + for (int64_t ic = 0; ic < c_in; ++ic) { + for (int64_t ky = 0; ky < knl_h; ++ky) { + for (int64_t kx = 0; kx < knl_w; ++kx) { + const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y; + const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x; - int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; + int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; - float src_val; - if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { - src_val = 0.0f; - } else { - const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); - src_val = *src_ptr; + float src_val; + if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { + src_val = 0.0f; + } else { + const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + src_val = *src_ptr; + } + + char * element_ptr = dst_row + dst_idx * traits->type_size; + if (kernel_type == GGML_TYPE_F32) { + *(float *) element_ptr = src_val; + } else if (kernel_type == GGML_TYPE_F16) { + *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + } } + } + } + } // patches handled by this thread + } + else { + for (int64_t p = patch_start; p < patch_end; ++p) { + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t src_x = (p / dst_w) % dst_h; + const int64_t src_y = p % dst_w; + + const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; + + for (int64_t ic = 0; ic < c_in; ++ic) { + for (int64_t ky = 0; ky < knl_h; ++ky) { + for (int64_t kx = 0; kx < knl_w; ++kx) { + const int64_t sy = ggml_wrap_coord(src_x * stride_y + ky * dilation_y - pad_y, src_h); + const int64_t sx = ggml_wrap_coord(src_y * stride_x + kx * dilation_x - pad_x, src_w); + + int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; - char * element_ptr = dst_row + dst_idx * traits->type_size; - if (kernel_type == GGML_TYPE_F32) { - *(float *) element_ptr = src_val; - } else if (kernel_type == GGML_TYPE_F16) { - *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + float src_val = *src_ptr; + char * element_ptr = dst_row + dst_idx * traits->type_size; + if (kernel_type == GGML_TYPE_F32) { + *(float *) element_ptr = src_val; + } else if (kernel_type == GGML_TYPE_F16) { + *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + } } } } - } - } // patches handled by this thread + } // patches handled by this thread + } + ggml_barrier(params->threadpool); @@ -7066,6 +7104,7 @@ struct ggml_conv_2d_dw_params { int pad_y; int 
dilation_x; int dilation_y; + int circular; }; static void ggml_compute_forward_conv_2d_dw_cwhn( @@ -7091,57 +7130,103 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( const int64_t c_pkg_end = 0; #endif - for (int64_t row = row_start; row < row_end; ++row) { - const int64_t dst_y = row % p.dst_h; - const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; - for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { - float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; - const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; - const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + const int64_t circular = p.circular; -#ifdef GGML_SIMD - // Vectorized loop - for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { - GGML_F32_VEC sum = GGML_F32_VEC_ZERO; - for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = src_y_base + knl_y * p.dilation_y; - if (src_y < 0 || src_y >= p.src_h) { - continue; + if (circular == 0) { + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + + #ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = GGML_F32_VEC_FMA(sum, k, s); + } } - for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = src_x_base + knl_x * p.dilation_x; - if (src_x < 0 || src_x >= p.src_w) { + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } + #endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { continue; } - GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); - GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); - sum = GGML_F32_VEC_FMA(sum, k, s); + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; + } } + dst_data[c_i] = sum; } - GGML_F32_VEC_STORE(dst_data + c_i, sum); } -#endif - // Scalar loop - for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { - float sum = 0.0f; - for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = src_y_base + knl_y * p.dilation_y; - if (src_y < 0 || src_y >= p.src_h) { - continue; + } + } + else { + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const 
float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + + #ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = ggml_wrap_coord(src_y_base + knl_y * p.dilation_y, p.src_h); + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = ggml_wrap_coord(src_x_base + knl_x * p.dilation_x, p.src_w); + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = GGML_F32_VEC_FMA(sum, k, s); + } } - for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = src_x_base + knl_x * p.dilation_x; - if (src_x < 0 || src_x >= p.src_w) { - continue; + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } + #endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = ggml_wrap_coord(src_y_base + knl_y * p.dilation_y, p.src_h); + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = ggml_wrap_coord(src_x_base + knl_x * p.dilation_x, p.src_w); + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; } - sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] - * src_data[(src_y * p.src_w + src_x) * c + c_i]; } + dst_data[c_i] = sum; } - dst_data[c_i] = sum; } } } + } static void ggml_compute_forward_conv_2d_dw_whcn( @@ -7156,30 +7241,57 @@ static void ggml_compute_forward_conv_2d_dw_whcn( const int64_t start = params->ith * per_thread; const int64_t end = MIN(start + per_thread, n); - for (int64_t i = start; i < end; ++i) { - const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; - const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; - float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + const int64_t circular = p.circular; - for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { - for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + if (circular == 0) { + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; - float sum = 0.0f; - for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { - const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; - if (src_y < 0 || src_y >= p.src_h) { - continue; - } - for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { - const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; - if (src_x < 0 || src_x >= p.src_w) { + for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y < 0 || src_y >= p.src_h) { continue; } - sum += knl_data[knl_y * p.knl_w + knl_x] - * src_data[src_y * p.src_w + src_x]; + for (int64_t knl_x = 0; knl_x < 
p.knl_w; ++knl_x) { + const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } } + dst_data[dst_y * p.dst_w + dst_x] = sum; + } + } + } + } + else { + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + + for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = ggml_wrap_coord(dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y, p.src_h); + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = ggml_wrap_coord(dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x, p.src_w); + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } + } + dst_data[dst_y * p.dst_w + dst_x] = sum; } - dst_data[dst_y * p.dst_w + dst_x] = sum; } } } @@ -7206,6 +7318,7 @@ void ggml_compute_forward_conv_2d_dw( p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; + p.circular = dst->op_params[6]; GGML_ASSERT(kernel->ne[3] == p.channels); GGML_ASSERT(dst->ne[3] == p.batch); @@ -7626,24 +7739,51 @@ static void ggml_compute_forward_pad_f32( const int32_t rp2 = ggml_get_op_params_i32(dst, 5); const int32_t lp3 = ggml_get_op_params_i32(dst, 6); const int32_t rp3 = ggml_get_op_params_i32(dst, 7); + const int32_t circular = ggml_get_op_params_i32(dst, 8); // TODO: optimize - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) \ - && (i1 >= lp1 && i1 < ne1 - rp1) \ - && (i2 >= lp2 && i2 < ne2 - rp2) \ - && (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + if (circular == 0) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + if ((i0 >= lp0 && i0 < ne0 - rp0) \ + && (i1 >= lp1 && i1 < ne1 - rp1) \ + && (i2 >= lp2 && i2 < ne2 - rp2) \ + && (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + const float * src_ptr = (const float *)((char *) src0->data + src_idx); + dst_ptr[dst_idx] = *src_ptr; + } else { + dst_ptr[dst_idx] = 0; + } + } + } + } + } + } + else { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + const int64_t src_i0 = ggml_wrap_coord(i0 - lp0, ne00); + const int64_t src_i1 = ggml_wrap_coord(i1 - lp1, ne01); + const int64_t src_i2 = ggml_wrap_coord(i2 - lp2, ne02); + const int64_t src_i3 = ggml_wrap_coord(i3 - lp3, ne03); + + const int64_t src_idx = + src_i3*nb03 + + src_i2*nb02 + + src_i1*nb01 + + src_i0*nb00; + const float * src_ptr 
= (const float *)((char *) src0->data + src_idx); dst_ptr[dst_idx] = *src_ptr; - } else { - dst_ptr[dst_idx] = 0; } } } From 1b62b49aa82bf1e2eb69a43f77881ab6b3d6b5f9 Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 14:34:56 -0800 Subject: [PATCH 3/5] Feat: Added cuda kernels --- ggml/src/ggml-cuda/conv2d-dw.cu | 71 ++++++++++++++++------ ggml/src/ggml-cuda/conv2d-transpose.cu | 82 ++++++++++++++++++-------- ggml/src/ggml-cuda/conv2d.cu | 60 ++++++++++++++----- ggml/src/ggml-cuda/pad.cu | 66 +++++++++++++++------ 4 files changed, 204 insertions(+), 75 deletions(-) diff --git a/ggml/src/ggml-cuda/conv2d-dw.cu b/ggml/src/ggml-cuda/conv2d-dw.cu index 7583233b1b7cd..255f131a6ae2d 100644 --- a/ggml/src/ggml-cuda/conv2d-dw.cu +++ b/ggml/src/ggml-cuda/conv2d-dw.cu @@ -8,6 +8,7 @@ struct conv_params { int padding_x, padding_y; int dilation_x, dilation_y; int channels, batches; + int circular; }; struct kernel_bounds { @@ -17,14 +18,23 @@ struct kernel_bounds { __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) { kernel_bounds bounds; - bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.y_max = - min(params.kernel_h, - (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); - bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); - bounds.x_max = - min(params.kernel_w, - (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + if (params.circular) { + bounds.y_min = 0; + bounds.y_max = params.kernel_h; + bounds.x_min = 0; + bounds.x_max = params.kernel_w; + } + else { + bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.y_max = + min(params.kernel_h, + (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + bounds.x_max = + min(params.kernel_w, + (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + + } return bounds; } @@ -32,6 +42,10 @@ __device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coo return out_coord * stride + kern_coord * dilation - padding; } +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord % size + size) % size; +} + struct whcn_layout { __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) { return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x; @@ -83,7 +97,8 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr const int in_w, const int in_h, const int out_w, const int out_h, const int kernel_w, const int kernel_h, const int stride_x, const int stride_y, const int padding_x, const int padding_y, const int dilation_x, const int dilation_y, - const int channels, const int batches) { + const int channels, const int batches, + const int circular) { const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = batches * channels * out_h * out_w; @@ -92,7 +107,7 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr } conv_params params = { in_w, in_h, out_w, out_h, kernel_w, 
kernel_h,  stride_x,
-                            stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches };
+                            stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches, circular };
 
     int batch_idx, channel_idx, out_y_idx, out_x_idx;
     Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx);
@@ -100,18 +115,35 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr
     T accumulator = 0;
     kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params);
 
-    for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
-        int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
+    if (params.circular == 0) {
+        for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
+            int src_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
+
+            for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
+                int src_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
+
+                const T input_val = input[Layout::input_index(batch_idx, channel_idx, src_y_idx, src_x_idx, params)];
+                const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
+
+                accumulator += input_val * kernel_val;
+            }
+        }
+    }
+    else {
+        for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
+            int in_y_idx = wrap_coord(calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y), params.in_h);
 
-        for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
-            int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
+            for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
+                int in_x_idx = wrap_coord(calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x), params.in_w);
 
-            const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
-            const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
+                const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
+                const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
 
-            accumulator += input_val * kernel_val;
+                accumulator += input_val * kernel_val;
+            }
         }
     }
 
+
     output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
 }
@@ -132,6 +164,7 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     const int padding_y = p[3];
     const int dilation_x = p[4];
     const int dilation_y = p[5];
+    const int circular = p[6];
 
     const int in_w = input->ne[0];
     const int in_h = input->ne[1];
@@ -150,11 +183,11 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     if (ggml_is_contiguous(input)) {
         conv2d_dw_kernel<<>>(
             x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
-            dilation_x, dilation_y, channels, batches);
+            dilation_x, dilation_y, channels, batches, circular);
     } else if (ggml_is_contiguous_channels(input)) {
         conv2d_dw_kernel<<>>(
             x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
-            dilation_x, dilation_y, channels, batches);
+            dilation_x, dilation_y, channels, batches, circular);
     } else {
         GGML_ABORT("Unsupported memory layout for conv_2d_dw");
     }
 }
diff --git 
a/ggml/src/ggml-cuda/conv2d-transpose.cu b/ggml/src/ggml-cuda/conv2d-transpose.cu index 03224e404d32d..253f1dc3613de 100644 --- a/ggml/src/ggml-cuda/conv2d-transpose.cu +++ b/ggml/src/ggml-cuda/conv2d-transpose.cu @@ -3,10 +3,16 @@ #include "conv2d-transpose.cuh" #include "ggml.h" + +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord % size + size) % size; +} + __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel, float * __restrict__ output, const int in_w, const int in_h, const int out_w, const int out_h, const int kernel_w, const int kernel_h, const int stride, - const int c_in, const int c_out, const int batches) { + const int c_in, const int c_out, const int batches, + const int circular) { const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = out_w * out_h * c_out * batches; @@ -22,28 +28,55 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const float accumulator = 0; // For each output idx, find the inputs that contribute to it by checking stride alignment and bounds - - for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { - for (int kh = 0; kh < kernel_h; ++kh) { - int in_y = out_y_idx - kh; - if (in_y < 0 || in_y % stride) continue; - in_y /= stride; - if (in_y >= in_h) continue; - - for (int kw = 0; kw < kernel_w; ++kw) { - int in_x = out_x_idx - kw; - if (in_x < 0 || in_x % stride) continue; - in_x /= stride; - if (in_x >= in_w) continue; - - const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; - const int kernel_idx = - (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; - - float input_val = input[input_idx]; - half kern_val = kernel[kernel_idx]; - - accumulator += input_val * (float) kern_val; + if (circular == 0) { + for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { + for (int kh = 0; kh < kernel_h; ++kh) { + int in_y = out_y_idx - kh; + if (in_y < 0 || in_y % stride) continue; + in_y /= stride; + if (in_y >= in_h) continue; + + for (int kw = 0; kw < kernel_w; ++kw) { + int in_x = out_x_idx - kw; + if (in_x < 0 || in_x % stride) continue; + in_x /= stride; + if (in_x >= in_w) continue; + + const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; + const int kernel_idx = + (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; + + float input_val = input[input_idx]; + half kern_val = kernel[kernel_idx]; + + accumulator += input_val * (float) kern_val; + } + } + } + } + else { + for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { + for (int kh = 0; kh < kernel_h; ++kh) { + int in_y = out_y_idx - kh; + if (in_y % stride) continue; + in_y /= stride; + in_y = wrap_coord(in_y, in_h); + + for (int kw = 0; kw < kernel_w; ++kw) { + int in_x = out_x_idx - kw; + if (in_x % stride) continue; + in_x /= stride; + in_x = wrap_coord(in_x, in_w); + + const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; + const int kernel_idx = + (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; + + float input_val = input[input_idx]; + half kern_val = kernel[kernel_idx]; + + accumulator += input_val * (float) kern_val; + } } } } @@ -72,6 +105,7 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor const int kernel_h = kernel->ne[1]; const int stride = 
dst->op_params[0]; const int batches = input->ne[3]; + const int circular = dst->op_params[1]; GGML_ASSERT(channels_in == kernel->ne[3]); GGML_ASSERT(stride > 0); @@ -87,5 +121,5 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor conv2d_transpose_kernel<<>>( input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride, - channels_in, channels_out, batches); + channels_in, channels_out, batches, circular); } diff --git a/ggml/src/ggml-cuda/conv2d.cu b/ggml/src/ggml-cuda/conv2d.cu index 142dd66903aaa..e9d78df2c88bd 100644 --- a/ggml/src/ggml-cuda/conv2d.cu +++ b/ggml/src/ggml-cuda/conv2d.cu @@ -11,6 +11,7 @@ struct conv_params { const int64_t IC, OC; const int64_t B; const int64_t TOTAL; + const int64_t CIRCULAR; }; struct kernel_bounds { @@ -26,12 +27,24 @@ __device__ __forceinline__ int64_t min64(int64_t a, int64_t b) { return (a < b) ? a : b; } +__device__ __forceinline__ int wrap_coord(int coord, int size) { + return (coord % size + size) % size; +} + __device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv_params & P) { kernel_bounds bounds; - bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); - bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); - bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + if (P.CIRCULAR) { + bounds.y_min = 0; + bounds.y_max = P.KH; + bounds.x_min = 0; + bounds.x_max = P.KW; + } + else { + bounds.y_min = max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.y_max = min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.x_min = max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + bounds.x_max = min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + } return bounds; } @@ -84,19 +97,37 @@ static __global__ void conv2d_kernel(const float * __restrict__ input, Layout::unpack_indices(global_idx, P, n, c_out, out_y, out_x); float acc = 0.0f; + if (P.CIRCULAR == 0) { + for (int64_t c_in = 0; c_in < P.IC; ++c_in) { + kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); + + for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { + const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); - for (int64_t c_in = 0; c_in < P.IC; ++c_in) { - kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); + for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { + const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + + const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; + const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; + acc += (input_val * ggml_cuda_cast(kernel_val)); + } + } + } + } + else { + for (int64_t c_in = 0; c_in < P.IC; ++c_in) { + kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); - for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { - const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { + const int64_t in_y = wrap_coord(calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y), P.IH); - for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { - const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { + const int64_t 
in_x = wrap_coord(calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X), P.IW); - const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; - const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; - acc += (input_val * ggml_cuda_cast(kernel_val)); + const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)]; + const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)]; + acc += (input_val * ggml_cuda_cast(kernel_val)); + } } } } @@ -141,6 +172,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int PD_Y = p[3]; // padding_y const int DL_X = p[4]; // dilation_x const int DL_Y = p[5]; // dilation_y + const int CIRCULAR = p[6]; // No cwhn GGML_ASSERT(p[6] == false); @@ -156,7 +188,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int B = input->ne[3]; // n_batches const int64_t total = B * OC * OH * OW; - conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total }; + conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total, CIRCULAR }; if (kernel->type == GGML_TYPE_F16) { conv2d_cuda_f16(X_D, (half *) K_D, Y_D, params, st); diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index 29aef33c1a4b8..f3f06897e4299 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -1,9 +1,18 @@ +#include + #include "pad.cuh" + + +__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) { + return (coord % size + size) % size; +} + static __global__ void pad_f32(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int circular) { // blockIdx.z: i3*ne2+i2 // blockIdx.y: i1 // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE @@ -12,39 +21,59 @@ static __global__ void pad_f32(const float * src, float * dst, int i1 = blockIdx.y; int i2 = blockIdx.z % ne2; int i3 = blockIdx.z / ne2; + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { return; } - - // operation + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - if ((i0 >= lp0 && i0 < ne0 - rp0) && - (i1 >= lp1 && i1 < ne1 - rp1) && - (i2 >= lp2 && i2 < ne2 - rp2) && - (i3 >= lp3 && i3 < ne3 - rp3)) { - const int64_t i00 = i0 - lp0; - const int64_t i01 = i1 - lp1; - const int64_t i02 = i2 - lp2; - const int64_t i03 = i3 - lp3; - const int64_t ne02 = ne2 - lp2 - rp2; - const int64_t ne01 = ne1 - lp1 - rp1; + + if (circular == 0) { + // operation + if ((i0 >= lp0 && i0 < ne0 - rp0) && + (i1 >= lp1 && i1 < ne1 - rp1) && + (i2 >= lp2 && i2 < ne2 - rp2) && + (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t i00 = i0 - lp0; + const int64_t i01 = i1 - lp1; + const int64_t i02 = i2 - lp2; + const int64_t i03 = i3 - lp3; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne00 = ne0 - lp0 - rp0; + + const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; + + dst[dst_idx] = src[src_idx]; + } else { + dst[dst_idx] = 0.0f; + } + } + else { const int64_t ne00 = ne0 - lp0 - rp0; + const int64_t ne01 = ne1 - lp1 - rp1; + const int64_t ne02 = ne2 - lp2 - rp2; + const int64_t ne03 = ne3 - lp3 - rp3; + + const int64_t i00 = wrap_coord(i0 - lp0, ne00); + const int64_t i01 = wrap_coord(i1 - 
lp1, ne01); + const int64_t i02 = wrap_coord(i2 - lp2, ne02); + const int64_t i03 = wrap_coord(i3 - lp3, ne03); const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; dst[dst_idx] = src[src_idx]; - } else { - dst[dst_idx] = 0.0f; } } static void pad_f32_cuda(const float * src, float * dst, const int lp0, const int rp0, const int lp1, const int rp1, const int lp2, const int rp2, const int lp3, const int rp3, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + const int ne0, const int ne1, const int ne2, const int ne3, + const int circular, cudaStream_t stream) { int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; dim3 gridDim(num_blocks, ne1, ne2*ne3); - pad_f32<<>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3); + pad_f32<<>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3, circular); } void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -65,8 +94,9 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int32_t rp2 = ((const int32_t*)(dst->op_params))[5]; const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; const int32_t rp3 = ((const int32_t*)(dst->op_params))[7]; + const int32_t circular = ((const int32_t*)(dst->op_params))[8]; pad_f32_cuda(src0_d, dst_d, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], circular, stream); } From 60bed3b95c653fc8fdebd5c429f8d5d50a37bf0d Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 15:03:52 -0800 Subject: [PATCH 4/5] Added tests --- tests/test-backend-ops.cpp | 463 +++++++++++++++++++++++++++++++++++-- 1 file changed, 444 insertions(+), 19 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 967a53c63d86d..4e0546cb4d8f4 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4454,6 +4454,7 @@ struct test_conv_2d : public test_case { const int dilation1; // Whether the inputs are contiguous in the channel dim or the width dim const bool cwhn; + const bool circular; // If true, the direct CONV_2D will be used in the graph, otherwise it // uses ggml_conv_2d: @@ -4463,7 +4464,7 @@ struct test_conv_2d : public test_case { // IM2COL -> MUL_MM graph will be built. 
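    // Editor's illustrative note (not part of the patch): with the extra `circular`
    // constructor argument introduced in this struct, a toroidal variant of an
    // existing case could be registered in make_test_cases_eval() along the lines of
    //
    //     test_cases.emplace_back(new test_conv_2d({16, 16, 4, 1}, {3, 3, 4, 8},
    //                                              GGML_TYPE_F32, 1, 1, 1, 1, 1, 1,
    //                                              /*cwhn=*/false, /*circular=*/true));
    //
    // The shapes above are hypothetical; only the argument order is taken from the
    // constructor shown in this diff.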
std::string vars() override { - return VARS_TO_STR10(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn); + return VARS_TO_STR11(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn, circular); } double max_nmse_err() override { @@ -4499,7 +4500,8 @@ struct test_conv_2d : public test_case { test_conv_2d(std::array ne_input = { 64, 64, 16, 1 }, std::array ne_kernel = { 3, 3, 1, 16 }, ggml_type type_kernel = GGML_TYPE_F32, int stride0 = 1, - int stride1 = 1, int padding0 = 0, int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) : + int stride1 = 1, int padding0 = 0, int padding1 = 0, int dilation0 = 1, int dilation1 = 1, + bool cwhn = false, bool circular = false) : ne_input(ne_input), ne_kernel(ne_kernel), type_kernel(type_kernel), @@ -4509,7 +4511,8 @@ struct test_conv_2d : public test_case { padding1(padding1), dilation0(dilation0), dilation1(dilation1), - cwhn(cwhn) {} + cwhn(cwhn), + circular(circular) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); @@ -4527,8 +4530,58 @@ struct test_conv_2d : public test_case { kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1); } - ggml_tensor * out = - ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_tensor * out = circular + ? ggml_conv_2d_direct_circular(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1) + : ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_set_name(out, "out"); + return out; + } +}; + +struct test_conv_2d_im2col : public test_case { + const std::array ne_input; + const std::array ne_kernel; + const ggml_type type_kernel; + const int stride0; + const int stride1; + const int padding0; + const int padding1; + const int dilation0; + const int dilation1; + const bool circular; + + std::string vars() override { + return VARS_TO_STR10(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, circular); + } + + test_conv_2d_im2col(std::array ne_input = { 32, 24, 8, 2 }, + std::array ne_kernel = { 3, 3, 8, 4 }, + ggml_type type_kernel = GGML_TYPE_F32, + int stride0 = 1, int stride1 = 1, + int padding0 = 0, int padding1 = 0, + int dilation0 = 1, int dilation1 = 1, + bool circular = false) + : ne_input(ne_input), + ne_kernel(ne_kernel), + type_kernel(type_kernel), + stride0(stride0), + stride1(stride1), + padding0(padding0), + padding1(padding1), + dilation0(dilation0), + dilation1(dilation1), + circular(circular) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + ggml_set_name(input, "input"); + + ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data()); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * out = circular + ? 
ggml_conv_2d_circular(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1) + : ggml_conv_2d(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); ggml_set_name(out, "out"); return out; } @@ -4542,15 +4595,16 @@ struct test_conv_2d_dw : public test_case { const int padding; const int dilation; const bool cwhn; + const bool circular; std::string vars() override { - return VARS_TO_STR6(ne_input, ne_kernel, stride, padding, dilation, cwhn); + return VARS_TO_STR7(ne_input, ne_kernel, stride, padding, dilation, cwhn, circular); } test_conv_2d_dw(std::array ne_input = {64, 64, 16, 1}, std::array ne_kernel = {3, 3, 1, 16}, - int stride = 1, int padding = 0, int dilation = 1, bool cwhn = false) - : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), padding(padding), dilation(dilation), cwhn(cwhn) {} + int stride = 1, int padding = 0, int dilation = 1, bool cwhn = false, bool circular = false) + : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), padding(padding), dilation(dilation), cwhn(cwhn), circular(circular) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); @@ -4568,14 +4622,270 @@ struct test_conv_2d_dw : public test_case { kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1); } - ggml_tensor * out = ggml_conv_2d_dw_direct( - ctx, kernel, input, - stride, stride, padding, padding, dilation, dilation); + ggml_tensor * out = circular + ? ggml_conv_2d_dw_direct_circular(ctx, kernel, input, + stride, stride, padding, padding, dilation, dilation) + : ggml_conv_2d_dw_direct(ctx, kernel, input, + stride, stride, padding, padding, dilation, dilation); ggml_set_name(out, "out"); return out; } }; +struct test_conv_2d_direct_circular_manual : public test_case { + const std::array ne_input{5, 4, 1, 1}; + const std::array ne_kernel{3, 3, 1, 1}; + const int stride0 = 1; + const int stride1 = 1; + const int padding0 = 2; + const int padding1 = 1; + const int dilation0 = 1; + const int dilation1 = 1; + + ggml_tensor * input = nullptr; + ggml_tensor * kernel = nullptr; + ggml_tensor * expected = nullptr; + + std::string vars() override { + return "manual_conv2d_direct_circular"; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(input, "input"); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * actual = ggml_conv_2d_direct_circular( + ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_set_name(actual, "actual"); + + int64_t ne_out[4] = { + conv_out_size(ne_input[0], ne_kernel[0], stride0, padding0, dilation0), + conv_out_size(ne_input[1], ne_kernel[1], stride1, padding1, dilation1), + ne_kernel[3], + ne_input[3], + }; + + expected = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_out); + ggml_set_name(expected, "expected"); + + ggml_tensor * diff = ggml_sub(ctx, actual, expected); + ggml_tensor * sq = ggml_sqr(ctx, diff); + ggml_tensor * loss = ggml_sum(ctx, sq); + ggml_set_name(loss, "loss"); + return loss; + } + + void initialize_tensors(ggml_context * ctx) override { + test_case::initialize_tensors(ctx); + + std::vector input_data(ggml_nelements(input)); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(std::sin(static_cast(i + 1))); + } + ggml_backend_tensor_set(input, input_data.data(), 0, input_data.size() * sizeof(float)); + + 
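+        // Deterministic, non-trivial kernel values (cos of the flat index); the
+        // expected output computed below is a plain CPU reference convolution
+        // that wraps input coordinates with wrap_coord_circular(), i.e. the
+        // circular-padding behaviour under test.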
std::vector kernel_data(ggml_nelements(kernel)); + for (size_t i = 0; i < kernel_data.size(); ++i) { + kernel_data[i] = static_cast(std::cos(static_cast(i + 1))); + } + ggml_backend_tensor_set(kernel, kernel_data.data(), 0, kernel_data.size() * sizeof(float)); + + int64_t ne_out[4] = { + conv_out_size(ne_input[0], ne_kernel[0], stride0, padding0, dilation0), + conv_out_size(ne_input[1], ne_kernel[1], stride1, padding1, dilation1), + ne_kernel[3], + ne_input[3], + }; + std::vector expected_data(ggml_nelements(expected), 0.0f); + + for (int64_t n = 0; n < ne_input[3]; ++n) { + for (int64_t oc = 0; oc < ne_kernel[3]; ++oc) { + for (int64_t oy = 0; oy < ne_out[1]; ++oy) { + for (int64_t ox = 0; ox < ne_out[0]; ++ox) { + float sum = 0.0f; + for (int64_t ic = 0; ic < ne_kernel[2]; ++ic) { + for (int64_t ky = 0; ky < ne_kernel[1]; ++ky) { + const int64_t in_y = wrap_coord_circular( + oy * stride1 + ky * dilation1 - padding1, ne_input[1]); + for (int64_t kx = 0; kx < ne_kernel[0]; ++kx) { + const int64_t in_x = wrap_coord_circular( + ox * stride0 + kx * dilation0 - padding0, ne_input[0]); + const int64_t src_idx = offset4d(ne_input.data(), in_x, in_y, ic, n); + const int64_t ker_idx = offset4d(ne_kernel.data(), kx, ky, ic, oc); + sum += input_data[src_idx] * kernel_data[ker_idx]; + } + } + } + expected_data[offset4d(ne_out, ox, oy, oc, n)] = sum; + } + } + } + } + + ggml_backend_tensor_set(expected, expected_data.data(), 0, expected_data.size() * sizeof(float)); + } + + double max_nmse_err() override { + return 1e-8; + } +}; + +struct test_conv_2d_dw_direct_circular_manual : public test_case { + const std::array ne_input{4, 3, 2, 1}; + const std::array ne_kernel{3, 2, 1, 2}; + const int stride = 1; + const int padding = 1; + const int dilation = 1; + + ggml_tensor * input = nullptr; + ggml_tensor * kernel = nullptr; + ggml_tensor * expected = nullptr; + + std::string vars() override { + return "manual_conv2d_dw_direct_circular"; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(input, "input"); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * actual = ggml_conv_2d_dw_direct_circular( + ctx, kernel, input, stride, stride, padding, padding, dilation, dilation); + ggml_set_name(actual, "actual"); + + int64_t ne_out[4] = { + conv_out_size(ne_input[0], ne_kernel[0], stride, padding, dilation), + conv_out_size(ne_input[1], ne_kernel[1], stride, padding, dilation), + ne_input[2], + ne_input[3], + }; + expected = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_out); + ggml_set_name(expected, "expected"); + + ggml_tensor * diff = ggml_sub(ctx, actual, expected); + ggml_tensor * sq = ggml_sqr(ctx, diff); + ggml_tensor * loss = ggml_sum(ctx, sq); + ggml_set_name(loss, "loss"); + return loss; + } + + void initialize_tensors(ggml_context * ctx) override { + test_case::initialize_tensors(ctx); + + std::vector input_data(ggml_nelements(input)); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i % 7); + } + ggml_backend_tensor_set(input, input_data.data(), 0, input_data.size() * sizeof(float)); + + std::vector kernel_data(ggml_nelements(kernel)); + for (size_t i = 0; i < kernel_data.size(); ++i) { + kernel_data[i] = static_cast((i % 5) - 2); + } + ggml_backend_tensor_set(kernel, kernel_data.data(), 0, kernel_data.size() * sizeof(float)); + + int64_t ne_out[4] = { + conv_out_size(ne_input[0], ne_kernel[0], stride, 
padding, dilation), + conv_out_size(ne_input[1], ne_kernel[1], stride, padding, dilation), + ne_input[2], + ne_input[3], + }; + + std::vector expected_data(ggml_nelements(expected), 0.0f); + for (int64_t n = 0; n < ne_input[3]; ++n) { + for (int64_t c = 0; c < ne_input[2]; ++c) { + for (int64_t oy = 0; oy < ne_out[1]; ++oy) { + for (int64_t ox = 0; ox < ne_out[0]; ++ox) { + float sum = 0.0f; + for (int64_t ky = 0; ky < ne_kernel[1]; ++ky) { + const int64_t in_y = wrap_coord_circular( + oy * stride + ky * dilation - padding, ne_input[1]); + for (int64_t kx = 0; kx < ne_kernel[0]; ++kx) { + const int64_t in_x = wrap_coord_circular( + ox * stride + kx * dilation - padding, ne_input[0]); + const int64_t src_idx = offset4d(ne_input.data(), in_x, in_y, c, n); + const int64_t ker_idx = offset4d(ne_kernel.data(), kx, ky, 0, c); + sum += input_data[src_idx] * kernel_data[ker_idx]; + } + } + expected_data[offset4d(ne_out, ox, oy, c, n)] = sum; + } + } + } + } + + ggml_backend_tensor_set(expected, expected_data.data(), 0, expected_data.size() * sizeof(float)); + } + + double max_nmse_err() override { + return 1e-8; + } +}; + +struct test_conv_2d_circular_pipeline : public test_case { + const std::array ne_input{6, 5, 3, 2}; + const std::array ne_kernel{3, 3, 3, 4}; + const int stride0 = 2; + const int stride1 = 1; + const int padding0 = 1; + const int padding1 = 2; + const int dilation0 = 1; + const int dilation1 = 2; + + ggml_tensor * input = nullptr; + ggml_tensor * kernel = nullptr; + + std::string vars() override { + return "conv2d_circular_vs_pipeline"; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(input, "input"); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * actual = ggml_conv_2d_circular( + ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_set_name(actual, "actual"); + + ggml_tensor * padded = ggml_pad_ext_circular(ctx, input, padding0, padding0, padding1, padding1, 0, 0, 0, 0); + ggml_set_name(padded, "padded"); + ggml_tensor * reference = ggml_conv_2d(ctx, kernel, padded, stride0, stride1, 0, 0, dilation0, dilation1); + ggml_set_name(reference, "reference"); + + ggml_tensor * diff = ggml_sub(ctx, actual, reference); + ggml_tensor * sq = ggml_sqr(ctx, diff); + ggml_tensor * loss = ggml_sum(ctx, sq); + ggml_set_name(loss, "loss"); + return loss; + } + + void initialize_tensors(ggml_context * ctx) override { + test_case::initialize_tensors(ctx); + + std::vector input_data(ggml_nelements(input)); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(std::fmod(static_cast(i * 3 + 1), 17.0)); + } + ggml_backend_tensor_set(input, input_data.data(), 0, input_data.size() * sizeof(float)); + + std::vector kernel_data(ggml_nelements(kernel)); + for (size_t i = 0; i < kernel_data.size(); ++i) { + kernel_data[i] = static_cast(std::fmod(static_cast(i * 7 + 3), 11.0) - 5.0); + } + ggml_backend_tensor_set(kernel, kernel_data.data(), 0, kernel_data.size() * sizeof(float)); + } + + double max_nmse_err() override { + return 1e-8; + } +}; + // GGML_OP_CONV_3D struct test_conv_3d : public test_case { // Logical 5D dimensions @@ -5288,26 +5598,43 @@ struct test_acc : public test_case { }; // GGML_OP_PAD +static inline int64_t wrap_coord_circular(int64_t coord, int64_t size) { + GGML_ASSERT(size > 0); + const int64_t mod = coord % size; + return mod < 0 ? 
mod + size : mod; +} + +static inline int64_t offset4d(const int64_t ne[4], int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + return ((i3 * ne[2] + i2) * ne[1] + i1) * ne[0] + i0; +} + +static inline int64_t conv_out_size(int64_t ins, int64_t ks, int stride, int pad, int dilation) { + return (ins + 2 * pad - dilation * (ks - 1) - 1) / stride + 1; +} + struct test_pad : public test_case { const ggml_type type; const std::array ne_a; const int pad_0; const int pad_1; + const bool circular; std::string vars() override { - return VARS_TO_STR4(type, ne_a, pad_0, pad_1); + return VARS_TO_STR5(type, ne_a, pad_0, pad_1, circular); } test_pad(ggml_type type = GGML_TYPE_F32, std::array ne_a = {512, 512, 1, 1}, - int pad_0 = 1, int pad_1 = 1) - : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {} + int pad_0 = 1, int pad_1 = 1, bool circular = false) + : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1), circular(circular) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); ggml_set_name(a, "a"); - ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0); + ggml_tensor * out = circular + ? ggml_pad_circular(ctx, a, pad_0, pad_1, 0, 0) + : ggml_pad(ctx, a, pad_0, pad_1, 0, 0); ggml_set_name(out, "out"); return out; @@ -5326,17 +5653,19 @@ struct test_pad_ext : public test_case { const int lp3; const int rp3; const bool v; + const bool circular; std::string vars() override { - return VARS_TO_STR11(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v); + return VARS_TO_STR12(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v, circular); } test_pad_ext(ggml_type type = GGML_TYPE_F32, std::array ne_a = {512, 512, 3, 1}, int lp0 = 1, int rp0 = 1, int lp1 = 1, int rp1 = 1, int lp2 = 1, int rp2 = 1, int lp3 = 1, int rp3 = 1, - bool v = false) - : type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3), v(v) {} + bool v = false, bool circular = false) + : type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3), + v(v), circular(circular) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); @@ -5347,7 +5676,9 @@ struct test_pad_ext : public test_case { ggml_set_name(a, "view of a"); } - ggml_tensor * out = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + ggml_tensor * out = circular + ? 
ggml_pad_ext_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + : ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); ggml_set_name(out, "out"); return out; @@ -5355,6 +5686,83 @@ struct test_pad_ext : public test_case { }; // GGML_OP_PAD_REFLECT_1D + +struct test_pad_ext_circular_manual : public test_case { + const std::array ne_src{4, 3, 1, 1}; + const std::array pads_l{1, 2, 0, 0}; + const std::array pads_r{2, 1, 0, 0}; + + ggml_tensor * input = nullptr; + ggml_tensor * expected = nullptr; + + std::string vars() override { + return "manual_pad_ext_circular"; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_src.data()); + ggml_set_name(input, "input"); + + ggml_tensor * actual = ggml_pad_ext_circular(ctx, input, + pads_l[0], pads_r[0], pads_l[1], pads_r[1], pads_l[2], pads_r[2], pads_l[3], pads_r[3]); + ggml_set_name(actual, "actual"); + + int64_t ne_dst[4] = { + ne_src[0] + pads_l[0] + pads_r[0], + ne_src[1] + pads_l[1] + pads_r[1], + ne_src[2] + pads_l[2] + pads_r[2], + ne_src[3] + pads_l[3] + pads_r[3], + }; + + expected = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_dst); + ggml_set_name(expected, "expected"); + + ggml_tensor * diff = ggml_sub(ctx, actual, expected); + ggml_tensor * sq = ggml_sqr(ctx, diff); + ggml_tensor * loss = ggml_sum(ctx, sq); + ggml_set_name(loss, "loss"); + return loss; + } + + void initialize_tensors(ggml_context * ctx) override { + test_case::initialize_tensors(ctx); + + std::vector src_data(ggml_nelements(input)); + for (size_t i = 0; i < src_data.size(); ++i) { + src_data[i] = static_cast(i + 1); + } + ggml_backend_tensor_set(input, src_data.data(), 0, src_data.size() * sizeof(float)); + + int64_t ne_dst[4] = { + ne_src[0] + pads_l[0] + pads_r[0], + ne_src[1] + pads_l[1] + pads_r[1], + ne_src[2] + pads_l[2] + pads_r[2], + ne_src[3] + pads_l[3] + pads_r[3], + }; + + std::vector exp_data(ggml_nelements(expected)); + for (int64_t i3 = 0; i3 < ne_dst[3]; ++i3) { + for (int64_t i2 = 0; i2 < ne_dst[2]; ++i2) { + for (int64_t i1 = 0; i1 < ne_dst[1]; ++i1) { + for (int64_t i0 = 0; i0 < ne_dst[0]; ++i0) { + const int64_t src_i0 = wrap_coord_circular(i0 - pads_l[0], ne_src[0]); + const int64_t src_i1 = wrap_coord_circular(i1 - pads_l[1], ne_src[1]); + const int64_t src_i2 = wrap_coord_circular(i2 - pads_l[2], ne_src[2]); + const int64_t src_i3 = wrap_coord_circular(i3 - pads_l[3], ne_src[3]); + exp_data[offset4d(ne_dst, i0, i1, i2, i3)] = + src_data[offset4d(ne_src.data(), src_i0, src_i1, src_i2, src_i3)]; + } + } + } + } + ggml_backend_tensor_set(expected, exp_data.data(), 0, exp_data.size() * sizeof(float)); + } + + double max_nmse_err() override { + return 1e-8; + } +}; + struct test_pad_reflect_1d : public test_case { const ggml_type type; const std::array ne_a; @@ -6477,10 +6885,23 @@ static std::vector> make_test_cases_eval() { // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true)); // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_conv_2d({37, 29, 3, 2}, {3, 2, 3, 5}, GGML_TYPE_F32, 1, 1, 4, 3, 1, 1, false, true)); + test_cases.emplace_back(new test_conv_2d({19, 23, 4, 1}, {5, 3, 4, 4}, GGML_TYPE_F16, 2, 1, 3, 2, 1, 2, false, true)); + test_cases.emplace_back(new test_conv_2d({16, 18, 6, 3}, {3, 3, 6, 8}, GGML_TYPE_F32, 1, 2, 2, 3, 1, 1, true, 
true)); + + test_cases.emplace_back(new test_conv_2d_im2col()); + test_cases.emplace_back(new test_conv_2d_im2col({17, 13, 6, 2}, {3, 3, 6, 4}, GGML_TYPE_F32, 1, 2, 2, 3, 1, 1, true)); + test_cases.emplace_back(new test_conv_2d_im2col({11, 7, 2, 1}, {3, 3, 2, 3}, GGML_TYPE_F16, 1, 1, 1, 1, 2, 1, true)); + test_cases.emplace_back(new test_conv_2d_direct_circular_manual()); + test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, false)); test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, true)); test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false)); test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true)); + test_cases.emplace_back(new test_conv_2d_dw({29, 19, 8, 2}, {5, 3, 1, 8}, 1, 2, 1, false, true)); + test_cases.emplace_back(new test_conv_2d_dw({24, 14, 16, 1}, {3, 3, 1, 16}, 2, 1, 2, true, true)); + test_cases.emplace_back(new test_conv_2d_dw_direct_circular_manual()); + test_cases.emplace_back(new test_conv_2d_circular_pipeline()); // CONV_3D auto calc_conv_output_size_3d = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { @@ -7214,7 +7635,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {9, 9, 1280, 1})); test_cases.emplace_back(new test_acc()); test_cases.emplace_back(new test_pad()); + test_cases.emplace_back(new test_pad_ext_circular_manual()); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); test_cases.emplace_back(new test_pad_ext()); + test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {19, 11, 5, 2}, 2, 4, 1, 3, 0, 0, 0, 0, false, true)); test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); test_cases.emplace_back(new test_roll()); @@ -7225,6 +7649,7 @@ static std::vector> make_test_cases_eval() { for (bool v : {false, true}) { test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v)); test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {11, 22, 33, 44}, 1, 2, 3, 4, 5, 6, 7, 8, v)); + test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {23, 17, 7, 3}, 2, 1, 3, 0, 1, 2, 0, 0, v, true)); } for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 576 }) { From 5700a4e7e92200a3119d36a4719390c787cc1831 Mon Sep 17 00:00:00 2001 From: bepis Date: Mon, 3 Nov 2025 15:37:57 -0800 Subject: [PATCH 5/5] Added tests --- tests/test-backend-ops.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 4e0546cb4d8f4..29b2a77b60e0a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4632,6 +4632,20 @@ struct test_conv_2d_dw : public test_case { } }; +static inline int64_t conv_out_size(int64_t ins, int64_t ks, int stride, int pad, int dilation) { + return (ins + 2 * pad - dilation * (ks - 1) - 1) / stride + 1; +} +// GGML_OP_PAD +static inline int64_t wrap_coord_circular(int64_t coord, int64_t size) { + GGML_ASSERT(size > 0); + const int64_t mod = coord % size; + return mod < 0 ? 
mod + size : mod;
+}
+
+static inline int64_t offset4d(const int64_t ne[4], int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    return ((i3 * ne[2] + i2) * ne[1] + i1) * ne[0] + i0;
+}
+
 struct test_conv_2d_direct_circular_manual : public test_case {
     const std::array<int64_t, 4> ne_input{5, 4, 1, 1};
     const std::array<int64_t, 4> ne_kernel{3, 3, 1, 1};
@@ -5597,20 +5611,7 @@ struct test_acc : public test_case {
     }
 };
 
-// GGML_OP_PAD
-static inline int64_t wrap_coord_circular(int64_t coord, int64_t size) {
-    GGML_ASSERT(size > 0);
-    const int64_t mod = coord % size;
-    return mod < 0 ? mod + size : mod;
-}
-
-static inline int64_t offset4d(const int64_t ne[4], int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-    return ((i3 * ne[2] + i2) * ne[1] + i1) * ne[0] + i0;
-}
-
-static inline int64_t conv_out_size(int64_t ins, int64_t ks, int stride, int pad, int dilation) {
-    return (ins + 2 * pad - dilation * (ks - 1) - 1) / stride + 1;
-}
-
 struct test_pad : public test_case {
     const ggml_type type;