From e6a3cc5192c117e3f106aa27aea841555398b125 Mon Sep 17 00:00:00 2001
From: taiqzheng <2013898008@qq.com>
Date: Mon, 27 Feb 2023 22:26:22 +0800
Subject: [PATCH] Add Halide Conv Layer Benchmark.

---
 CMakeLists.txt                                |   8 +
 README.md                                     |   7 +
 benchmarks/DeepLearning/CMakeLists.txt        |   1 +
 benchmarks/DeepLearning/Layers/CMakeLists.txt |  40 ++++
 .../Layers/HalideConvLayerBenchmark.cpp       |  87 ++++++++
 benchmarks/DeepLearning/Layers/Main.cpp       |  46 +++++
 .../conv_layer_generator-autoschedule.cpp     |  71 +++++++
 .../Layers/conv_layer_generator-manually.cpp  | 185 ++++++++++++++++++
 .../Layers/conv_layer_generator.cpp           |  52 +++++
 9 files changed, 497 insertions(+)
 create mode 100644 benchmarks/DeepLearning/Layers/CMakeLists.txt
 create mode 100644 benchmarks/DeepLearning/Layers/HalideConvLayerBenchmark.cpp
 create mode 100644 benchmarks/DeepLearning/Layers/Main.cpp
 create mode 100644 benchmarks/DeepLearning/Layers/conv_layer_generator-autoschedule.cpp
 create mode 100644 benchmarks/DeepLearning/Layers/conv_layer_generator-manually.cpp
 create mode 100644 benchmarks/DeepLearning/Layers/conv_layer_generator.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fd777dd5..375561a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,14 @@ if(DEFINED IMAGE_PROCESSING_BENCHMARKS OR DEEP_LEARNING_BENCHMARKS OR OP_OPTIMIZ
   include_directories(${OpenCV_INCLUDE_DIRS})
 endif()
 
+#-------------------------------------------------------------------------------
+# Find Halide
+#-------------------------------------------------------------------------------
+
+if(DEFINED DEEP_LEARNING_BENCHMARKS)
+  find_package(Halide REQUIRED)
+endif()
+
 #-------------------------------------------------------------------------------
 # Find PNG
 #-------------------------------------------------------------------------------
diff --git a/README.md b/README.md
index a1882ac6..c372d075 100644
--- a/README.md
+++ b/README.md
@@ -82,6 +82,12 @@ $ cd bin && ./image-processing-benchmark
diff --git a/benchmarks/DeepLearning/Layers/HalideConvLayerBenchmark.cpp b/benchmarks/DeepLearning/Layers/HalideConvLayerBenchmark.cpp
new file mode 100644
--- /dev/null
+++ b/benchmarks/DeepLearning/Layers/HalideConvLayerBenchmark.cpp
@@ -0,0 +1,87 @@
+#include <benchmark/benchmark.h>
+
+#include "conv_layer_nonschedule.h"
+#include "conv_layer_manuallyschedule.h"
+#include "conv_layer_autoschedule.h"
+#include <cstdlib>
+#include "HalideBuffer.h"
+
+using namespace Halide::Runtime;
+
+const int N = 5, CI = 128, CO = 128, W = 100, H = 80;
+
+Buffer<float, 4> input(CI, W + 2, H + 2, N), input1(CI, W + 2, H + 2, N), input2(CI, W + 2, H + 2, N);
+Buffer<float, 4> filter(CO, 3, 3, CI), filter1(CO, 3, 3, CI), filter2(CO, 3, 3, CI);
+Buffer<float, 1> bias(CO), bias1(CO), bias2(CO);
+Buffer<float, 4> output(CO, W, H, N), output1(CO, W, H, N), output2(CO, W, H, N);
+
+void initializeHalideConvLayerBenchmark(char **argv) {
+  for (int c = 0; c < input.dim(3).extent(); c++) {
+    for (int z = 0; z < input.channels(); z++) {
+      for (int y = 0; y < input.height(); y++) {
+        for (int x = 0; x < input.width(); x++) {
+          input(x, y, z, c) = rand();
+          input1(x, y, z, c) = input(x, y, z, c);
+          input2(x, y, z, c) = input(x, y, z, c);
+        }
+      }
+    }
+  }
+
+  for (int c = 0; c < filter.dim(3).extent(); c++) {
+    for (int z = 0; z < filter.channels(); z++) {
+      for (int y = 0; y < filter.height(); y++) {
+        for (int x = 0; x < filter.width(); x++) {
+          filter(x, y, z, c) = rand();
+          filter1(x, y, z, c) = filter(x, y, z, c);
+          filter2(x, y, z, c) = filter(x, y, z, c);
+        }
+      }
+    }
+  }
+
+  for (int x = 0; x < bias.width(); x++) {
+    bias(x) = rand();
+    bias1(x) = bias(x);
+    bias2(x) = bias(x);
+  }
+
+#ifdef _WIN32
+  _putenv_s("HL_CUDA_JIT_MAX_REGISTERS", "256");
+#else
+  setenv("HL_CUDA_JIT_MAX_REGISTERS", "256", 1);
+#endif
+}
+
+static void Halide_ConvLayer_NonSchedule(benchmark::State &state) {
+  for (auto _ : state) {
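+    // state.range(0) is the per-iteration repeat count; the benchmarks below
+    // are registered with ->Arg(1), so each timed iteration runs one conv.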
+    for (int i = 0; i < state.range(0); ++i) {
+      conv_layer_nonschedule(input, filter, bias, output);
+    }
+  }
+}
+
+static void Halide_ConvLayer_ManuallySchedule(benchmark::State &state) {
+  for (auto _ : state) {
+    for (int i = 0; i < state.range(0); ++i) {
+      conv_layer_manuallyschedule(input1, filter1, bias1, output1);
+    }
+  }
+}
+
+static void Halide_ConvLayer_AutoSchedule(benchmark::State &state) {
+  for (auto _ : state) {
+    for (int i = 0; i < state.range(0); ++i) {
+      conv_layer_autoschedule(input2, filter2, bias2, output2);
+    }
+  }
+}
+
+// Register benchmarking function.
+void registerBenchmarkHalideConvLayer() {
+  BENCHMARK(Halide_ConvLayer_NonSchedule)->Arg(1)->Unit(benchmark::kMillisecond);
+  BENCHMARK(Halide_ConvLayer_ManuallySchedule)->Arg(1)->Unit(benchmark::kMillisecond);
+  BENCHMARK(Halide_ConvLayer_AutoSchedule)->Arg(1)->Unit(benchmark::kMillisecond);
+}
+
diff --git a/benchmarks/DeepLearning/Layers/Main.cpp b/benchmarks/DeepLearning/Layers/Main.cpp
new file mode 100644
index 00000000..e9ce46db
--- /dev/null
+++ b/benchmarks/DeepLearning/Layers/Main.cpp
@@ -0,0 +1,46 @@
+//===- Main.cpp -----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the main file of the Halide Conv Layer benchmark.
+//
+//===----------------------------------------------------------------------===//
+
+#include <benchmark/benchmark.h>
+#include <stdexcept>
+
+void initializeHalideConvLayerBenchmark(char **);
+
+void registerBenchmarkHalideConvLayer();
+
+// Run benchmarks.
+int main(int argc, char **argv) {
+  if (argc != 1) {
+    throw std::invalid_argument(
+        "No arguments needed.\n");
+  }
+
+  initializeHalideConvLayerBenchmark(argv);
+
+  // Register benchmark function.
+  registerBenchmarkHalideConvLayer();
+
+  ::benchmark::Initialize(&argc, argv);
+  ::benchmark::RunSpecifiedBenchmarks();
+
+  return 0;
+}
diff --git a/benchmarks/DeepLearning/Layers/conv_layer_generator-autoschedule.cpp b/benchmarks/DeepLearning/Layers/conv_layer_generator-autoschedule.cpp
new file mode 100644
index 00000000..9c13b634
--- /dev/null
+++ b/benchmarks/DeepLearning/Layers/conv_layer_generator-autoschedule.cpp
@@ -0,0 +1,71 @@
+#include "Halide.h"
+
+namespace {
+
+using namespace Halide;
+
+class ConvolutionLayer : public Halide::Generator<ConvolutionLayer> {
+public:
+    Input<Buffer<float, 4>> input{"input"};
+    Input<Buffer<float, 4>> filter{"filter"};
+    Input<Buffer<float, 1>> bias{"bias"};
+    Output<Buffer<float, 4>> relu{"relu"};
+
+    void generate() {
+        const int N = 5, CI = 128, CO = 128, W = 100, H = 80;
+
+        /* THE ALGORITHM */
+
+        Var x("x"), y("y"), c("c"), n("n");
+
+        Func conv("conv");
+        RDom r(0, CI, 0, 3, 0, 3);
+
+        conv(c, x, y, n) = bias(c);
+        conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n);
+
+        relu(c, x, y, n) = max(0, conv(c, x, y, n));
+
+        /* THE SCHEDULE */
+
+        relu.dim(0).set_bounds(0, CO).set_stride(1);
+        relu.dim(1).set_bounds(0, W).set_stride(CO);
+        relu.dim(2).set_bounds(0, H).set_stride(CO * W);
+        relu.dim(3).set_bounds(0, N).set_stride(CO * H * W);
+
+        input.dim(0).set_bounds(0, CI).set_stride(1);
+        input.dim(1).set_bounds(0, W + 2).set_stride(CI);
+        input.dim(2).set_bounds(0, H + 2).set_stride(CI * (W + 2));
+        input.dim(3).set_bounds(0, N).set_stride(CI * (W + 2) * (H + 2));
+
+        filter.dim(0).set_bounds(0, CO).set_stride(1);
+        filter.dim(1).set_bounds(0, 3).set_stride(CO);
+        filter.dim(2).set_bounds(0, 3).set_stride(CO * 3);
+        filter.dim(3).set_bounds(0, CI).set_stride(CO * 3 * 3);
+
+        bias.dim(0).set_bounds(0, CO).set_stride(1);
+
+        if (using_autoscheduler()) {
+            input.dim(0).set_estimate(0, CI);
+            input.dim(1).set_estimate(0, W + 2);
+            input.dim(2).set_estimate(0, H + 2);
+            input.dim(3).set_estimate(0, N);
+
+            filter.dim(0).set_estimate(0, CO);
+            filter.dim(1).set_estimate(0, 3);
+            filter.dim(2).set_estimate(0, 3);
+            filter.dim(3).set_estimate(0, CI);
+
+            bias.dim(0).set_estimate(0, CO);
+
+            relu.dim(0).set_estimate(0, CO);
+            relu.dim(1).set_estimate(0, W);
+            relu.dim(2).set_estimate(0, H);
+            relu.dim(3).set_estimate(0, N);
+        }
+    }
+};
+
+} // namespace
+
+HALIDE_REGISTER_GENERATOR(ConvolutionLayer, conv_layer_autoschedule)
diff --git a/benchmarks/DeepLearning/Layers/conv_layer_generator-manually.cpp b/benchmarks/DeepLearning/Layers/conv_layer_generator-manually.cpp
new file mode 100644
index 00000000..33d43e62
--- /dev/null
+++ b/benchmarks/DeepLearning/Layers/conv_layer_generator-manually.cpp
@@ -0,0 +1,185 @@
+#include "Halide.h"
+
+namespace {
+
+using namespace Halide;
+
+class ConvolutionLayer : public Halide::Generator<ConvolutionLayer> {
+public:
+    Input<Buffer<float, 4>> input{"input"};
+    Input<Buffer<float, 4>> filter{"filter"};
+    Input<Buffer<float, 1>> bias{"bias"};
+    Output<Buffer<float, 4>> relu{"relu"};
+
+    void generate() {
+        const int N = 5, CI = 128, CO = 128, W = 100, H = 80;
+
+        /* THE ALGORITHM */
+
+        Var x("x"), y("y"), c("c"), n("n");
+
+        Func conv("conv");
+        RDom r(0, CI, 0, 3, 0, 3);
+
+        conv(c, x, y, n) = bias(c);
+        conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n);
+
+        relu(c, x, y, n) = max(0, conv(c, x, y, n));
+
+        /* THE SCHEDULE */
+
+        relu.dim(0).set_bounds(0, CO).set_stride(1);
+        relu.dim(1).set_bounds(0, W).set_stride(CO);
+        relu.dim(2).set_bounds(0, H).set_stride(CO * W);
+        relu.dim(3).set_bounds(0, N).set_stride(CO * H * W);
+
+        input.dim(0).set_bounds(0, CI).set_stride(1);
+        input.dim(1).set_bounds(0, W + 2).set_stride(CI);
+        input.dim(2).set_bounds(0, H + 2).set_stride(CI * (W + 2));
+        input.dim(3).set_bounds(0, N).set_stride(CI * (W + 2) * (H + 2));
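+
+        // Note: the explicit bounds and strides above pin an identical planar
+        // layout in all three generator variants, so the benchmark harness can
+        // feed identically shaped buffers to every pipeline.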
+
+        filter.dim(0).set_bounds(0, CO).set_stride(1);
+        filter.dim(1).set_bounds(0, 3).set_stride(CO);
+        filter.dim(2).set_bounds(0, 3).set_stride(CO * 3);
+        filter.dim(3).set_bounds(0, CI).set_stride(CO * 3 * 3);
+
+        bias.dim(0).set_bounds(0, CO).set_stride(1);
+
+        if (get_target().has_feature(Target::CUDA)) {
+            // GPU schedule, tuned for a GTX 980. Seems to be good on
+            // an RTX 2060 too (about 90% of peak flops on both cards).
+
+            // 1.87 ms on an RTX 2060. According to NVIDIA Nsight
+            // Compute, we're at 91.5% utilization of the FMA units.
+
+            // 2.41 ms on a GTX 980. According to nvprof, this is about
+            // 88% of peak flops.
+
+            // We use CUDA-specific scheduling directives (gpu_lanes),
+            // so this is not a general GPGPU schedule.
+
+            Var ni, no, xi, xo, yi, yo, ci, co, t;
+            RVar rxo, rxi, rxii;
+            relu.compute_root()
+                .split(x, xo, xi, 5)
+                .split(y, yo, yi, 5)
+                .split(c, co, ci, 32)
+                .reorder(xi, yi, ci, xo, yo, co, n)
+                .gpu_lanes(ci)
+                .unroll(xi)
+                .unroll(yi)
+                .fuse(co, n, t)
+                .gpu_blocks(xo, yo, t);
+
+            conv.compute_at(relu, xo)
+                .store_in(MemoryType::Register)
+                .gpu_lanes(c)
+                .unroll(x)
+                .unroll(y)
+                .update()
+                .split(r.x, rxo, rxi, 16)
+                .split(rxi, rxi, rxii, 2)
+                .reorder(c, rxii, x, y, r.y, r.z, rxi, rxo)
+                .gpu_lanes(c)
+                .unroll(x)
+                .unroll(y)
+                .unroll(r.y)
+                .unroll(r.z)
+                .unroll(rxii);
+
+            input.in()
+                .compute_at(conv, rxo)
+                .vectorize(_0, 2)
+                .split(_1, xo, xi, 4)
+                .fuse(_0, xi, t)
+                .gpu_lanes(t)
+                .unroll(xo)
+                .unroll(_2);
+
+        } else {
+
+            // 4.06 ms on an Intel i9-9960X using 16 threads at 3.0 GHz,
+            // which is 94.5% of peak flops, assuming the math below is
+            // correct:
+
+            // 16 cores times 2 FMAs per cycle times 3G cycles per
+            // second times 16 vector lanes is a peak throughput of
+            // 1.536 TFlops.
+
+            // This conv does N * CI * CO * W * H * 3 * 3 = 5 * 128 *
+            // 128 * 100 * 80 * 3 * 3 FMAs; doing that in 4.06 ms is
+            // 1.453 TFlops.
+
+            // The ratio of actual to theoretical flops hit is 0.9458.
+
+            int tile_w = 1;
+            int tile_h = 1;
+            const int vec = natural_vector_size<float>();
+
+            if (get_target().has_feature(Target::AVX512_Skylake) ||
+                (get_target().arch == Target::ARM &&
+                 get_target().bits == 64)) {
+                // On Skylake we have one load per FMA and 32
+                // registers available, so there's considerable
+                // flexibility in the schedule. We'll use 20 accumulator
+                // registers in a 4x5 tile. This is also a reasonable
+                // choice for ARMv8, which also has 32 registers.
+                tile_w = 4;
+                tile_h = 5;
+            } else if (get_target().arch == Target::X86) {
+                // With 16-register ISAs like x86 with AVX2, we can
+                // only do one load per two FMAs, which constrains the
+                // schedule to a squarish 12-register tile of the
+                // output.
+                tile_w = 3;
+                tile_h = 4;
+            } else {
+                // The above should also be a reasonable schedule for
+                // ARMv7 and other 16-register machines, but I see
+                // some spills on arm-32, so we use a 2x4 block of 8
+                // accumulators instead. This could probably be better
+                // tuned, because in principle 12 accumulators should
+                // be possible. I believe the issue is that there's no
+                // fused multiply-add instruction, and so we're
+                // fighting llvm's instruction scheduler, which wants
+                // to move the muls well ahead of the adds to cover
+                // instruction latencies.
+                tile_w = 2;
+                tile_h = 4;
+            }
+
+            Var co, ci, xo, xi, yo, yi, t;
+            relu.split(c, co, ci, vec * tile_w)
+                .split(x, xo, xi, tile_h)
+                .reorder(ci, xi, xo, y, n, co)
+                .vectorize(ci, vec)
+                .unroll(ci)
+                .unroll(xi)
+                .parallel(y)
+                .parallel(n)
+                .parallel(co);
+            conv.compute_at(relu, xo)
+                .vectorize(c, vec)
+                .unroll(c)
+                .unroll(x)
+                .unroll(y)
+                .update()
+                .reorder(c, x, y, r.x, r.y, r.z, n)
+                .vectorize(c, vec)
+                .unroll(c)
+                .unroll(x)
+                .unroll(y)
+                .unroll(r.x, 2);
+            filter.in()
+                .compute_at(conv, r.x)
+                .vectorize(_0, vec)
+                .unroll(_0)
+                .unroll(_3);
+            input.in()
+                .compute_at(conv, x)
+                .unroll(_0);
+        }
+    }
+};
+
+} // namespace
+
+HALIDE_REGISTER_GENERATOR(ConvolutionLayer, conv_layer_manuallyschedule)
diff --git a/benchmarks/DeepLearning/Layers/conv_layer_generator.cpp b/benchmarks/DeepLearning/Layers/conv_layer_generator.cpp
new file mode 100644
index 00000000..2acd913a
--- /dev/null
+++ b/benchmarks/DeepLearning/Layers/conv_layer_generator.cpp
@@ -0,0 +1,52 @@
+#include "Halide.h"
+
+namespace {
+
+using namespace Halide;
+
+class ConvolutionLayer : public Halide::Generator<ConvolutionLayer> {
+public:
+    Input<Buffer<float, 4>> input{"input"};
+    Input<Buffer<float, 4>> filter{"filter"};
+    Input<Buffer<float, 1>> bias{"bias"};
+    Output<Buffer<float, 4>> relu{"relu"};
+
+    void generate() {
+        const int N = 5, CI = 128, CO = 128, W = 100, H = 80;
+
+        /* THE ALGORITHM */
+
+        Var x("x"), y("y"), c("c"), n("n");
+
+        Func conv("conv");
+        RDom r(0, CI, 0, 3, 0, 3);
+
+        conv(c, x, y, n) = bias(c);
+        conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n);
+
+        relu(c, x, y, n) = max(0, conv(c, x, y, n));
+
+        /* THE SCHEDULE */
+
+        relu.dim(0).set_bounds(0, CO).set_stride(1);
+        relu.dim(1).set_bounds(0, W).set_stride(CO);
+        relu.dim(2).set_bounds(0, H).set_stride(CO * W);
+        relu.dim(3).set_bounds(0, N).set_stride(CO * H * W);
+
+        input.dim(0).set_bounds(0, CI).set_stride(1);
+        input.dim(1).set_bounds(0, W + 2).set_stride(CI);
+        input.dim(2).set_bounds(0, H + 2).set_stride(CI * (W + 2));
+        input.dim(3).set_bounds(0, N).set_stride(CI * (W + 2) * (H + 2));
+
+        filter.dim(0).set_bounds(0, CO).set_stride(1);
+        filter.dim(1).set_bounds(0, 3).set_stride(CO);
+        filter.dim(2).set_bounds(0, 3).set_stride(CO * 3);
+        filter.dim(3).set_bounds(0, CI).set_stride(CO * 3 * 3);
+
+        bias.dim(0).set_bounds(0, CO).set_stride(1);
+    }
+};
+
+} // namespace
+
+HALIDE_REGISTER_GENERATOR(ConvolutionLayer, conv_layer_nonschedule)
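
A quick way to sanity-check the three pipelines against each other is a small
standalone program that runs all of them on the same random data and compares
the outputs. The sketch below is illustrative and not part of the patch: it
assumes the generated headers used above, float32 buffers, and an arbitrary
1e-3 tolerance; the file name check_conv_layer.cpp is hypothetical.

// check_conv_layer.cpp (hypothetical): cross-check the three AOT-compiled
// pipelines produced by the generators in this patch.
#include <cmath>
#include <cstdio>
#include <cstdlib>

#include "HalideBuffer.h"
#include "conv_layer_autoschedule.h"
#include "conv_layer_manuallyschedule.h"
#include "conv_layer_nonschedule.h"

using Halide::Runtime::Buffer;

int main() {
  const int N = 5, CI = 128, CO = 128, W = 100, H = 80;
  Buffer<float, 4> in(CI, W + 2, H + 2, N), filt(CO, 3, 3, CI);
  Buffer<float, 1> b(CO);
  Buffer<float, 4> ref(CO, W, H, N), manual(CO, W, H, N), autosched(CO, W, H, N);

  // Fill the inputs once so every pipeline sees the same data.
  in.for_each_value([](float &v) { v = rand() / (float)RAND_MAX; });
  filt.for_each_value([](float &v) { v = rand() / (float)RAND_MAX; });
  b.for_each_value([](float &v) { v = rand() / (float)RAND_MAX; });

  conv_layer_nonschedule(in, filt, b, ref);
  conv_layer_manuallyschedule(in, filt, b, manual);
  conv_layer_autoschedule(in, filt, b, autosched);

  // The schedules reorder the floating-point reduction, so compare with a
  // small tolerance rather than demanding bit-exact results.
  float max_err = 0.0f;
  ref.for_each_element([&](int c, int x, int y, int n) {
    max_err = std::fmax(max_err, std::fabs(ref(c, x, y, n) - manual(c, x, y, n)));
    max_err = std::fmax(max_err, std::fabs(ref(c, x, y, n) - autosched(c, x, y, n)));
  });
  printf("max abs difference: %g\n", max_err);
  return max_err < 1e-3f ? 0 : 1;
}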