diff --git a/samples/core/CMakeLists.txt b/samples/core/CMakeLists.txt
index 75d8357d..558902e5 100644
--- a/samples/core/CMakeLists.txt
+++ b/samples/core/CMakeLists.txt
@@ -16,6 +16,7 @@ add_subdirectory(enumopencl)
 add_subdirectory(copybuffer)
 add_subdirectory(copybufferkernel)
 add_subdirectory(saxpy)
+add_subdirectory(multi-device)
 add_subdirectory(reduce)
 add_subdirectory(blur)
 add_subdirectory(binaries)
diff --git a/samples/core/multi-device/CMakeLists.txt b/samples/core/multi-device/CMakeLists.txt
new file mode 100644
index 00000000..ccf8933b
--- /dev/null
+++ b/samples/core/multi-device/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Copyright (c) 2023 The Khronos Group Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_sample(
+    TEST
+    TARGET multidevice
+    VERSION 300
+    SOURCES main.c
+    KERNELS convolution.cl)
+
+add_sample(
+    TEST
+    TARGET multidevicecpp
+    VERSION 300
+    SOURCES main.cpp
+    KERNELS convolution.cl)
diff --git a/samples/core/multi-device/README.md b/samples/core/multi-device/README.md
new file mode 100644
index 00000000..7029becb
--- /dev/null
+++ b/samples/core/multi-device/README.md
@@ -0,0 +1,131 @@
+# Multi-device Convolution Example
+
+## Sample purpose
+This example showcases how to set up the execution of a given kernel, in this case a convolution kernel, on multiple devices.
+
+## Key APIs and Concepts
+The main idea behind this example is that a given kernel can be run simultaneously by two (or potentially more) devices, thereby reducing its execution time. One can think of two main strategies for this workflow:
+1. each device computes its proportional share of the solution at its own speed and the results are combined on the host side when finished, or
+2. each device executes the kernel at its own speed, but after each iteration there is P2P communication between the devices to share the partial results.
+
+This example implements the first approach.
+
+### Kernel logic
+The kernel is a simple $3 \times 3$ convolution, meaning that the convolution over the input matrix is performed using a $3 \times 3$ mask matrix.
+
+In this implementation of the convolution kernel we assume that the input matrix is padded with 0s, so no extra conditional logic is necessary to ensure that the mask is applied to out-of-bounds elements (e.g. when processing element $(0,0)$ of the output matrix).
+
+## Application flow
+### Overview
+By default, the application selects the first two devices found on the platform with the most devices available. A command-line option is also provided so that the user can specify which type of device to prefer (e.g. "cpu" or "gpu").
+
+A random input matrix and mask are generated, and the workload is divided equally between the two devices: one of them performs the convolution over the upper half of the matrix and the other one does the same with the lower half. When both devices finish executing the kernel, the results are fetched and combined by the host.
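+
+In other words, for every element $(x, y)$ of its assigned half of the output, each device evaluates the zero-padded $3 \times 3$ convolution (written here in padded-input coordinates, matching the kernel's indexing):
+
+$$\mathrm{out}(x, y) = \sum_{j=0}^{2} \sum_{i=0}^{2} \mathrm{mask}(i, j) \cdot \mathrm{in}(x + i,\ y + j)$$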
+
+### Device selection
+As mentioned above, by default the first two devices of any type found are used. If there is only one device available, a single-device convolution is performed.
+
+If the user specifies a type of device to use, the program iterates over all available devices and selects the first two devices of that type from the platform with the most devices of that type available. If there are not enough devices (i.e. no platform provides more than one device of that type), the example runs as a single-device program.
+
+### Kernel launch
+The rest of the program does not differ much from the usual single-device kernel launch. The only difference is that each device needs its own set of runtime objects: context, program, command queue, kernel functor, input/mask/output buffers and so on. Note that the input buffers also contain different data, because the first device selected performs the convolution over the upper half of the matrix while the second device computes the convolution over the lower half.
+
+The kernels are then enqueued to each device's command queue, and two different events are used to wait for them to finish. When the devices finish the computation, the results are combined into a single host matrix and compared to the host-side reference results.
+
+## Used API surface
+### C
+```c
+CL_BLOCKING
+CL_DEVICE_PLATFORM
+CL_DEVICE_TYPE_ALL
+CL_HPP_TARGET_OPENCL_VERSION
+CL_INVALID_ARG_VALUE
+CL_KERNEL_WORK_GROUP_SIZE
+CL_MEM_COPY_HOST_PTR
+CL_MEM_HOST_READ_ONLY
+CL_MEM_READ_ONLY
+CL_MEM_WRITE_ONLY
+CL_PROFILING_COMMAND_END
+CL_PROFILING_COMMAND_START
+CL_QUEUE_PROFILING_ENABLE
+CL_QUEUE_PROPERTIES
+CL_SUCCESS
+cl_command_queue
+cl_command_queue_properties
+cl_context
+cl_device_type
+cl_event
+cl_float
+cl_int
+cl_kernel
+cl_mem
+cl_platform_id
+cl_program
+cl_sdk_fill_with_random_ints_range(pcg32_random_t*, cl_int*, size_t, cl_int, cl_int)
+cl_sdk_options_DeviceTriplet
+cl_sdk_options_Diagnostic
+cl_sdk_options_MultiDevice
+cl_uint
+cl_ulong
+cl_util_build_program(cl_program, cl_device_id, char*)
+cl_util_get_device(cl_uint, cl_uint, cl_device_type, cl_int*)
+cl_util_get_event_duration(cl_event, cl_profiling_info, cl_profiling_info, cl_int*)
+cl_util_print_device_info*(cl_device_id)
+cl_util_print_error(cl_int)
+cl_util_read_text_file(char*const, size_t*const, cl_int*)
+get_dev_type(char*)
+clCreateBuffer(cl_context, cl_mem_flags, size_t, void*, cl_int*)
+clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int*)
+clCreateCommandQueueWithProperties(cl_context, cl_device_id, cl_queue_properties*, cl_int*) -> OpenCL >= 2.0
+clCreateContext(cl_context_properties*, cl_uint, cl_device_id*, void *(char*, void*,size_t, void*), void*, cl_int*)
+clCreateKernel(cl_program, char*, cl_int*)
+clGetKernelWorkGroupInfo(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void*, size_t*)
+clCreateProgramWithSource(cl_context, cl_uint, char**, size_t*, cl_int*)
+clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, size_t*, size_t*, size_t*, cl_uint, cl_event*, cl_event*)
+clEnqueueReadBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, cl_event*, cl_event*)
+clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id*, cl_uint*)
+clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void*, size_t*)
+clGetPlatformIDs(cl_uint, cl_platform_id*, cl_uint*)
+clReleaseCommandQueue(cl_command_queue)
+clReleaseContext(cl_context)
+clReleaseKernel(cl_kernel)
+clReleaseMemObject(cl_mem)
+clReleaseProgram(cl_program)
+clSetKernelArg(cl_kernel, cl_uint, size_t, void *)
+clWaitForEvents(cl_uint, cl_event*)
+```
+
+### C++
+```c++
+cl::Buffer::Buffer(const Context&, IteratorType, IteratorType, bool, bool=false, cl_int*=NULL)
+cl::BuildError
+cl::CommandQueue::CommandQueue(const cl::Context&, cl::QueueProperties, cl_int*=NULL)
+cl::Context
+cl::Device::Device()
+cl::EnqueueArgs::EnqueueArgs(cl::CommandQueue&, cl::NDRange, cl::NDRange)
+cl::Error
+cl::Event
+cl::Kernel
+cl::KernelFunctor::KernelFunctor(const Program&, const string, cl_int*=NULL)
+cl::NDRange::NDRange(size_t, size_t)
+cl::NullRange
+cl::Platform::Platform()
+cl::Platform::Platform(cl::Platform)
+cl::Platform::get(vector*)
+cl::Program::Program()
+cl::Program::Program(cl::Program)
+cl::WaitForEvents(const vector&)
+cl::copy(const CommandQueue&, const cl::Buffer&, IteratorType, IteratorType)
+cl::sdk::comprehend()
+cl::sdk::fill_with_random()
+cl::sdk::get_context(cl_uint, cl_uint, cl_device_type, cl_int*)
+cl::sdk::options::MultiDevice
+cl::sdk::parse()
+cl::sdk::parse_cli()
+cl::sdk::options::DeviceTriplet
+cl::sdk::options::Diagnostic
+cl::sdk::options::SingleDevice
+cl::string::string(cl::string)
+cl::util::Error
+cl::util::get_duration(cl::Event&)
+cl::util::opencl_c_version_contains(const cl::Device&, const cl::string&)
+```
diff --git a/samples/core/multi-device/convolution.cl b/samples/core/multi-device/convolution.cl
new file mode 100644
index 00000000..019855cb
--- /dev/null
+++ b/samples/core/multi-device/convolution.cl
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2023 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+kernel void convolution_3x3(const global float* in, global float* out,
+                            const global float* mask, const uint2 out_dim)
+{
+    const uint2 gid = (uint2)(get_global_id(0), get_global_id(1));
+
+    const uint mask_dim = 3;
+    const uint pad_width = mask_dim / 2;
+    const uint2 in_dim = out_dim + pad_width * 2;
+
+    // Check possible out of bounds.
+    if (gid.x >= out_dim.x || gid.y >= out_dim.y)
+    {
+        return;
+    }
+
+    // Perform the convolution. Fix one mask row at a time and iterate over
+    // its elements, as data is stored row-major.
+    float result = 0.0f;
+    #if __OPENCL_C_VERSION__ >= 200
+    __attribute__((opencl_unroll_hint))
+    #endif
+    for (uint y = 0; y < mask_dim; ++y)
+    {
+        #if __OPENCL_C_VERSION__ >= 200
+        __attribute__((opencl_unroll_hint))
+        #endif
+        for (uint x = 0; x < mask_dim; ++x)
+        {
+            result += mask[y * mask_dim + x]
+                * in[(gid.y + y) * in_dim.x + gid.x + x];
+        }
+    }
+
+    // Write result to corresponding output cell.
+    out[gid.y * out_dim.x + gid.x] = result;
+}
diff --git a/samples/core/multi-device/main.c b/samples/core/multi-device/main.c
new file mode 100644
index 00000000..a45190a5
--- /dev/null
+++ b/samples/core/multi-device/main.c
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 2023 The Khronos Group Inc.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// OpenCL SDK includes. +#include +#include +#include +#include + +// OpenCL Utils includes. +#include +#include +#include + +// Standard header includes. +#include +#include +#include +#include + +// Sample-specific options. +struct convolution_options +{ + size_t x_dim; + size_t y_dim; +}; + +// Add option to CLI-parsing SDK utility for input dimensions. +cag_option ConvolutionOptions[] = { { .identifier = 'x', + .access_letters = "x", + .access_name = "x_dim", + .value_name = "(positive integral)", + .description = "x dimension of input" }, + + { .identifier = 'y', + .access_letters = "y", + .access_name = "y_dim", + .value_name = "(positive integral)", + .description = "y dimension of input" } }; + +ParseState parse_ConvolutionOptions(const char identifier, + cag_option_context* cag_context, + struct convolution_options* opts) +{ + const char* value; + + switch (identifier) + { + case 'x': + if ((value = cag_option_get_value(cag_context))) + { + opts->x_dim = strtoull(value, NULL, 0); + return ParsedOK; + } + else + return ParseError; + case 'y': + if ((value = cag_option_get_value(cag_context))) + { + opts->y_dim = strtoull(value, NULL, 0); + return ParsedOK; + } + else + return ParseError; + } + return NotParsed; +} + +cl_int parse_options(int argc, char* argv[], + struct cl_sdk_options_Diagnostic* diag_opts, + struct cl_sdk_options_SingleDevice* dev_opts, + struct convolution_options* convolution_opts) +{ + cl_int error = CL_SUCCESS; + struct cag_option *opts = NULL, *tmp = NULL; + size_t n = 0; + + // Prepare options array. + MEM_CHECK(opts = add_CLI_options(opts, &n, DiagnosticOptions, + CAG_ARRAY_SIZE(DiagnosticOptions)), + error, end); + MEM_CHECK(tmp = add_CLI_options(opts, &n, SingleDeviceOptions, + CAG_ARRAY_SIZE(SingleDeviceOptions)), + error, end); + opts = tmp; + MEM_CHECK(tmp = add_CLI_options(opts, &n, ConvolutionOptions, + CAG_ARRAY_SIZE(ConvolutionOptions)), + error, end); + opts = tmp; + + char identifier; + cag_option_context cag_context; + + // Prepare the context and iterate over all options. + cag_option_prepare(&cag_context, opts, n, argc, argv); + while (cag_option_fetch(&cag_context)) + { + ParseState state = NotParsed; + identifier = cag_option_get(&cag_context); + + PARS_OPTIONS(parse_DiagnosticOptions(identifier, diag_opts), state); + PARS_OPTIONS( + parse_SingleDeviceOptions(identifier, &cag_context, dev_opts), + state); + PARS_OPTIONS(parse_ConvolutionOptions(identifier, &cag_context, + convolution_opts), + state); + + if (identifier == 'h') + { + printf("Usage: dev_optsdevice [OPTION]...\n"); + printf("Option name and value should be separated by '=' or a " + "space\n"); + printf("Demonstrates convolution calculation with two " + "(sub)devices.\n\n"); + cag_option_print(opts, n, stdout); + exit((state == ParseError) ? CL_INVALID_ARG_VALUE : CL_SUCCESS); + } + } + +end: + free(opts); + return error; +} + +// Host-side implementation of the convolution for verification. 
Padded input +// assumed. +void host_convolution(const cl_float* in, cl_float* out, const cl_float* mask, + const size_t x_dim, const size_t y_dim) +{ + const cl_uint mask_dim = 3; + const cl_uint pad_width = mask_dim / 2; + const size_t pad_x_dim = x_dim + 2 * pad_width; + for (size_t x = 0; x < x_dim; ++x) + { + for (size_t y = 0; y < y_dim; ++y) + { + float result = 0.f; + for (size_t grid_column = x, mask_column = 0; + mask_column < mask_dim; ++grid_column, ++mask_column) + { + for (size_t grid_row = y, mask_row = 0; mask_row < mask_dim; + ++grid_row, ++mask_row) + { + result += mask[mask_column + mask_row * mask_dim] + * in[grid_column + grid_row * pad_x_dim]; + } + } + out[x + y * x_dim] = result; + } + } +} + +int main(int argc, char* argv[]) +{ + cl_int error = CL_SUCCESS; + cl_int end_error = CL_SUCCESS; + cl_device_id dev; + cl_context context; + cl_program program; + cl_mem dev_input_grid, dev_output_grid, dev_mask; + + cl_kernel convolutions[2] = { 0 }; + cl_command_queue sub_queues[2] = { 0 }; + cl_mem sub_input_grids[2] = { 0 }; + cl_mem sub_output_grids[2] = { 0 }; + cl_mem sub_masks[2] = { 0 }; + cl_event events[2] = { 0 }; + + // Parse command-line options. + struct cl_sdk_options_Diagnostic diag_opts = { .quiet = false, + .verbose = false }; + + // By default assume that there is only one device available. + // dev_opts->number is set to 1 so that when calling to cl_util_get_device + // for the second device there is no index out of range. + struct cl_sdk_options_SingleDevice dev_opts = { + .triplet = { 0, 0, CL_DEVICE_TYPE_ALL } + }; + struct convolution_options convolution_opts = { .x_dim = 4000, + .y_dim = 4000 }; + + OCLERROR_RET( + parse_options(argc, argv, &diag_opts, &dev_opts, &convolution_opts), + error, end); + + // Create runtime objects based on user preference or default. + OCLERROR_PAR(dev = cl_util_get_device(dev_opts.triplet.plat_index, + dev_opts.triplet.dev_index, + dev_opts.triplet.dev_type, &error), + error, dev); + + if (!diag_opts.quiet) + { + cl_util_print_device_info(dev); + } + + if (diag_opts.verbose) + { + printf("Creating sub-devices..."); + fflush(stdout); + } + +#if CL_HPP_TARGET_OPENCL_VERSION < 120 + fprintf(stderr, + "Error: OpenCL subdevices not supported before version 1.2 "); + exit(EXIT_FAILURE); +#endif + + // Create subdevices, each with half of the compute units available. + cl_uint subdev_count = 0, max_compute_units = 0; + OCLERROR_RET(clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(cl_uint), &max_compute_units, NULL), + error, dev); + cl_device_partition_property subdevices_properties[] = { + (cl_device_partition_property)CL_DEVICE_PARTITION_EQUALLY, + (cl_device_partition_property)max_compute_units / 2, 0 + }; + OCLERROR_RET(clCreateSubDevices(dev, subdevices_properties, + max_compute_units, NULL, &subdev_count), + error, dev); + + if (subdev_count < 2) + { + fprintf(stderr, "Error: OpenCL cannot create subdevices"); + exit(EXIT_FAILURE); + } + + cl_device_id* subdevices = + (cl_device_id*)malloc(subdev_count * sizeof(cl_device_id)); + OCLERROR_RET(clCreateSubDevices(dev, subdevices_properties, subdev_count, + subdevices, NULL), + error, subdevs); + + OCLERROR_PAR(context = clCreateContext(NULL, subdev_count, subdevices, NULL, + NULL, &error), + error, subdevs); + + // Read kernel file. 
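+    // Note: the kernel source below is loaded at run time relative to the
+    // current working directory, so run the sample from a directory that
+    // contains convolution.cl.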
+ const char* kernel_location = "./convolution.cl"; + char *kernel = NULL, *tmp = NULL; + size_t program_size = 0; + OCLERROR_PAR( + kernel = cl_util_read_text_file(kernel_location, &program_size, &error), + error, contx); + MEM_CHECK(tmp = (char*)realloc(kernel, program_size), error, ker); + kernel = tmp; + + // Compile kernel. + if (diag_opts.verbose) + { + printf("done.\nCompiling kernel..."); + fflush(stdout); + } + OCLERROR_PAR(program = clCreateProgramWithSource( + context, 1, (const char**)&kernel, &program_size, &error), + error, ker); + + // If no -cl-std option is specified then the highest 1.x version + // supported by each device is used to compile the program. Therefore, + // it's only necessary to add the -cl-std option for 2.0 and 3.0 OpenCL + // versions. + char compiler_options[1023] = ""; +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 + strcat(compiler_options, "-cl-std=CL3.0 "); +#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 + strcat(compiler_options, "-cl-std=CL2.0 "); +#endif + + OCLERROR_RET( + clBuildProgram(program, 2, subdevices, compiler_options, NULL, NULL), + error, prg); + + // Initialize host-side storage. + const cl_uint mask_dim = 3; + const cl_uint pad_width = mask_dim / 2; + const size_t x_dim = convolution_opts.x_dim; + const size_t y_dim = convolution_opts.y_dim; + const size_t pad_x_dim = x_dim + 2 * pad_width; + const size_t pad_y_dim = y_dim + 2 * pad_width; + + const size_t input_size = sizeof(cl_float) * pad_x_dim * pad_y_dim; + const size_t output_size = sizeof(cl_float) * x_dim * y_dim; + const size_t mask_size = sizeof(cl_float) * mask_dim * mask_dim; + + if (diag_opts.verbose) + { + printf("done.\nInitializing host-side storage...\n"); + fflush(stdout); + } + + // Random number generator. + pcg32_random_t rng; + pcg32_srandom_r(&rng, 11111, -2222); + + cl_float* h_input_grid; + cl_float* h_output_grid; + cl_float* h_mask; + + // Initialize input matrix. The input will be padded to remove + // conditional branches from the convolution kernel for determining + // out-of-bounds. + MEM_CHECK(h_input_grid = (cl_float*)malloc(input_size), error, prg); + if (diag_opts.verbose) + { + printf(" Generating %zu random numbers for convolution input grid...", + x_dim * y_dim); + fflush(stdout); + } + cl_sdk_fill_with_random_ints_range(&rng, (cl_int*)h_input_grid, + pad_x_dim * pad_y_dim, -1000, 1000); + // Fill with 0s the extra rows and columns added for padding. + for (size_t j = 0; j < pad_x_dim; ++j) + { + for (size_t i = 0; i < pad_y_dim; ++i) + { + if (i == 0 || j == 0 || i == (pad_y_dim - 1) + || j == (pad_x_dim - 1)) + { + h_input_grid[j + i * pad_x_dim] = 0; + } + } + } + + // Declare output matrix. Output will not be padded. + MEM_CHECK(h_output_grid = (cl_float*)malloc(output_size), error, hinput); + + // Initialize convolution mask. + MEM_CHECK(h_mask = (cl_float*)malloc(mask_size), error, houtput); + if (diag_opts.verbose) + { + printf("done.\n Generating %u random numbers for convolution mask...", + mask_dim * mask_dim); + fflush(stdout); + } + cl_sdk_fill_with_random_ints_range(&rng, (cl_int*)h_mask, + mask_dim * mask_dim, -1000, 1000); + + // Create device buffers, from which we will create the subbuffers for the + // subdevices. 
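+    // The padded input is split between the two sub-devices at its row
+    // midpoint. Each input region spans pad_grid_midpoint + 1 rows and the
+    // second region starts pad_grid_midpoint - 1 rows in, so the two regions
+    // overlap by two padded rows around the split: exactly the neighbouring
+    // rows the 3x3 mask needs for the output rows next to the boundary.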
+ const size_t grid_midpoint = y_dim / 2; + const size_t pad_grid_midpoint = pad_y_dim / 2; + + if (diag_opts.verbose) + { + printf("done.\nInitializing device-side storage..."); + fflush(stdout); + } + + OCLERROR_PAR(dev_input_grid = clCreateBuffer( + context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + input_size, h_input_grid, &error), + error, hmask); + OCLERROR_PAR(dev_output_grid = clCreateBuffer( + context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, + output_size, NULL, &error), + error, bufin); + OCLERROR_PAR(dev_mask = clCreateBuffer( + context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, mask_size, + h_mask, &error), + error, bufout); + + if (diag_opts.verbose) + { + printf("done.\nSetting up sub-devices..."); + fflush(stdout); + } + + // Set up subdevices for kernel execution. + const size_t half_input_size = + sizeof(cl_float) * pad_x_dim * (pad_grid_midpoint + 1); + const size_t input_offset = + sizeof(cl_float) * pad_x_dim * (pad_grid_midpoint - 1); + size_t subdevice = 0; + for (; subdevice < subdev_count; ++subdevice) + { + // Create kernel. + if (diag_opts.verbose) + { + printf("\n Creating kernel and command queue of sub-device %ld...", subdevice); + fflush(stdout); + } + + OCLERROR_PAR(convolutions[subdevice] = + clCreateKernel(program, "convolution_3x3", &error), + error, bufmask); + + // Initialize queues for command execution on each device. +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + cl_command_queue_properties props[] = { CL_QUEUE_PROPERTIES, + CL_QUEUE_PROFILING_ENABLE, 0 }; + OCLERROR_PAR( + sub_queues[subdevice] = clCreateCommandQueueWithProperties( + context, subdevices[subdevice], props, &error), + error, conv); +#else + OCLERROR_PAR(sub_queues[subdevice] = clCreateCommandQueue( + context, subdevices[subdevice], + CL_QUEUE_PROFILING_ENABLE, &error), + error, conv); +#endif + + // Initialize device-side storage. + // First device performs the convolution in the upper half and second + // device in the lower half (middle borders included). + if (diag_opts.verbose) + { + printf("done.\n Initializing device-side storage of sub-device " + "%ld...", + subdevice); + fflush(stdout); + } + + cl_buffer_region input_region = { subdevice * input_offset, + half_input_size }, + output_region = { 0, output_size }, + mask_region = { 0, mask_size }; + OCLERROR_PAR(sub_input_grids[subdevice] = clCreateSubBuffer( + dev_input_grid, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &input_region, &error), + error, subqueue); + OCLERROR_PAR(sub_output_grids[subdevice] = clCreateSubBuffer( + dev_output_grid, + CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &output_region, &error), + error, subbufin); + OCLERROR_PAR(sub_masks[subdevice] = clCreateSubBuffer( + dev_mask, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &mask_region, &error), + error, subbufout); + + if (diag_opts.verbose) + { + printf("done."); + fflush(stdout); + } + + // Set kernels arguments. 
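+        // Both sub-devices receive the same dimensions (the unpadded width
+        // and the number of output rows per half); the actual split is
+        // expressed purely through the sub-buffer regions bound to each
+        // kernel instance.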
+ OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 0, + sizeof(cl_mem), + &sub_input_grids[subdevice]), + error, subbufmask); + OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 1, + sizeof(cl_mem), + &sub_output_grids[subdevice]), + error, subbufmask); + OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 2, + sizeof(cl_mem), + &sub_masks[subdevice]), + error, subbufmask); + OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 3, + sizeof(size_t), &x_dim), + error, subbufmask); + OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 4, + sizeof(size_t), &grid_midpoint), + error, subbufmask); + } + + // Launch kernels. + if (diag_opts.verbose) + { + printf("\nExecuting on device... "); + fflush(stdout); + } + + // Enqueue kernel calls and wait for them to finish. + const size_t* global = (size_t[]){ pad_x_dim, pad_y_dim }; + + GET_CURRENT_TIMER(dev_start) + OCLERROR_RET(clEnqueueNDRangeKernel(sub_queues[0], convolutions[0], 2, NULL, + global, NULL, 0, NULL, &events[0]), + error, subbufmask); + OCLERROR_RET(clEnqueueNDRangeKernel(sub_queues[1], convolutions[1], 2, NULL, + global, NULL, 0, NULL, &events[1]), + error, event1); + + OCLERROR_RET(clWaitForEvents(1, &events[0]), error, event1); + OCLERROR_RET(clWaitForEvents(1, &events[1]), error, event2); + GET_CURRENT_TIMER(dev_end) + cl_ulong dev_time; + TIMER_DIFFERENCE(dev_time, dev_start, dev_end) + + // Compute reference host-side convolution. + if (diag_opts.verbose) + { + printf("done.\nExecuting on host... "); + fflush(stdout); + } + + GET_CURRENT_TIMER(host_start) + host_convolution(h_input_grid, h_output_grid, h_mask, x_dim, y_dim); + GET_CURRENT_TIMER(host_end) + cl_ulong host_time; + TIMER_DIFFERENCE(host_time, host_start, host_end) + + if (diag_opts.verbose) + { + printf("done.\n"); + fflush(stdout); + } + + // Fetch and combine results from devices. + cl_float* concatenated_results; + const size_t mid_output_count = x_dim * grid_midpoint; + const size_t mid_output_size = sizeof(cl_float) * mid_output_count; + MEM_CHECK(concatenated_results = (cl_float*)malloc(output_size), error, + event2); + OCLERROR_RET(clEnqueueReadBuffer(sub_queues[0], sub_output_grids[0], + CL_BLOCKING, 0, mid_output_size, + concatenated_results, 0, NULL, NULL), + error, result); + OCLERROR_RET(clEnqueueReadBuffer(sub_queues[1], sub_output_grids[1], + CL_BLOCKING, mid_output_size, mid_output_size, + concatenated_results + mid_output_count, 0, + NULL, NULL), + error, result); + + // Validate device-side solution. + cl_float deviation = 0.f; + const cl_float tolerance = 1e-6; + + for (size_t i = 0; i < x_dim * y_dim; ++i) + { + deviation += fabs(concatenated_results[i] - h_output_grid[i]); + } + deviation /= (x_dim * y_dim); + + if (deviation > tolerance) + { + printf("Failed convolution! 
Normalized deviation %.6f between host and " + "device exceeds tolerance %.6f\n", + deviation, tolerance); + fflush(stdout); + } + else + { + printf("Successful convolution!\n"); + fflush(stdout); + } + + if (!diag_opts.quiet) + { + printf("Kernels execution time as seen by host: %llu us.\n", + (unsigned long long)(dev_time + 500) / 1000); + + printf("Kernels execution time as measured by devices :\n"); + printf("\t%llu us.\n", + (unsigned long long)(cl_util_get_event_duration( + events[0], CL_PROFILING_COMMAND_START, + CL_PROFILING_COMMAND_END, &error) + + 500) + / 1000); + printf("\t%llu us.\n", + (unsigned long long)(cl_util_get_event_duration( + events[1], CL_PROFILING_COMMAND_START, + CL_PROFILING_COMMAND_END, &error) + + 500) + / 1000); + + printf("Reference execution as seen by host: %llu us.\n", + (unsigned long long)(host_time + 500) / 1000); + fflush(stdout); + } + +result: + free(concatenated_results); +event2: + OCLERROR_RET(clReleaseEvent(events[1]), end_error, event1); +event1: + OCLERROR_RET(clReleaseEvent(events[0]), end_error, subbufmask); +subbufmask: + if(subdevice == 1) + { + OCLERROR_RET(clReleaseMemObject(sub_masks[1]), end_error, subbufmask0); + } +subbufmask0: + OCLERROR_RET(clReleaseMemObject(sub_masks[0]), end_error, subbufout); +subbufout: + if(subdevice == 1) + { + OCLERROR_RET(clReleaseMemObject(sub_output_grids[1]), end_error, subbufout0); + } +subbufout0: + OCLERROR_PAR(clReleaseMemObject(sub_output_grids[0]), end_error, subbufin); +subbufin: + if(subdevice == 1) + { + OCLERROR_RET(clReleaseMemObject(sub_input_grids[1]), end_error, subbufin0); + } +subbufin0: + OCLERROR_RET(clReleaseMemObject(sub_input_grids[0]), end_error, subqueue); +subqueue: + if(subdevice == 1) + { + OCLERROR_RET(clReleaseCommandQueue(sub_queues[1]), end_error, subqueue0); + } +subqueue0: + OCLERROR_RET(clReleaseCommandQueue(sub_queues[1]), end_error, conv); +conv: + if(subdevice == 1) + { + OCLERROR_RET(clReleaseKernel(convolutions[1]), end_error, conv0); + } +conv0: + OCLERROR_RET(clReleaseKernel(convolutions[0]), end_error, bufmask); +bufmask: + OCLERROR_RET(clReleaseMemObject(dev_mask), end_error, bufout); +bufout: + OCLERROR_RET(clReleaseMemObject(dev_output_grid), end_error, bufin); +bufin: + OCLERROR_RET(clReleaseMemObject(dev_input_grid), end_error, hmask); +hmask: + free(h_mask); +houtput: + free(h_output_grid); +hinput: + free(h_input_grid); +prg: + OCLERROR_RET(clReleaseProgram(program), end_error, subdevs); +ker: + free(kernel); +contx: + OCLERROR_RET(clReleaseContext(context), end_error, end); +subdevs: + free(subdevices); +dev: + OCLERROR_RET(clReleaseDevice(dev), end_error, end); +end: + if (error) cl_util_print_error(error); + return error; +} diff --git a/samples/core/multi-device/main.cpp b/samples/core/multi-device/main.cpp new file mode 100644 index 00000000..8cbefbbe --- /dev/null +++ b/samples/core/multi-device/main.cpp @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2023 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// OpenCL SDK includes. +#include +#include +#include +#include + +// OpenCL Utils includes. +#include +#include +#include + +// TCLAP includes. +#include + +// Standard header includes. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Sample-specific options. +struct ConvolutionOptions +{ + size_t x_dim; + size_t y_dim; +}; + +// Add option to CLI-parsing SDK utility for input dimensions. +template <> auto cl::sdk::parse() +{ + return std::make_tuple(std::make_shared>( + "x", "x_dim", "x dimension of input", false, + 4'096, "positive integral"), + std::make_shared>( + "y", "y_dim", "y dimension of input", false, + 4'096, "positive integral")); +} +template <> +ConvolutionOptions cl::sdk::comprehend( + std::shared_ptr> x_dim_arg, + std::shared_ptr> y_dim_arg) +{ + return ConvolutionOptions{ x_dim_arg->getValue(), y_dim_arg->getValue() }; +} + +// Host-side implementation of the convolution for verification. Padded input +// assumed. +void host_convolution(const std::vector in, + std::vector& out, + const std::vector mask, const size_t x_dim, + const size_t y_dim) +{ + constexpr cl_uint mask_dim = 3; + constexpr cl_uint pad_width = mask_dim / 2; + const size_t pad_x_dim = x_dim + 2 * pad_width; + for (size_t x = 0; x < x_dim; ++x) + { + for (size_t y = 0; y < y_dim; ++y) + { + float result = 0.f; + for (size_t grid_column = x, mask_column = 0; + mask_column < mask_dim; ++grid_column, ++mask_column) + { + for (size_t grid_row = y, mask_row = 0; mask_row < mask_dim; + ++grid_row, ++mask_row) + { + result += mask[mask_column + mask_row * mask_dim] + * in[grid_column + grid_row * pad_x_dim]; + } + } + out[x + y * x_dim] = result; + } + } +} + +int main(int argc, char* argv[]) +{ + try + { + // Parse command-line options. + auto opts = cl::sdk::parse_cli(argc, argv); + const auto& diag_opts = std::get<0>(opts); + const auto& dev_opts = std::get<1>(opts); + const auto& conv_opts = std::get<2>(opts); + + // Create runtime objects based on user preference or default. + cl::Device dev = cl::sdk::get_context(dev_opts.triplet) + .getInfo() + .at(0); + + cl::Platform platform{ + dev.getInfo() + }; // https://github.com/KhronosGroup/OpenCL-CLHPP/issues/150 + + if (!diag_opts.quiet) + { + std::cout << "Selected device: " << dev.getInfo() + << "\n" + << "from " << platform.getInfo() + << " platform\n" + << std::endl; + } + + if (diag_opts.verbose) + { + std::cout << "Creating sub-devices..."; + std::cout.flush(); + } + +#if CL_HPP_TARGET_OPENCL_VERSION < 120 + std::cerr + << "Error: OpenCL subdevices not supported before version 1.2 " + << std::endl; + exit(EXIT_FAILURE); +#endif + + // Create subdevices, each with half of the compute units available. + cl_uint max_compute_units = dev.getInfo(); + cl_device_partition_property subdevices_properties[] = { + (cl_device_partition_property)CL_DEVICE_PARTITION_EQUALLY, + (cl_device_partition_property)(max_compute_units / 2), 0 + }; + std::vector subdevices{}; + dev.createSubDevices(subdevices_properties, &subdevices); + + if (subdevices.size() < 2) + { + std::cerr << "Error: OpenCL cannot create subdevices" << std::endl; + exit(EXIT_FAILURE); + } + + cl::Context context(subdevices); + + // Read kernel file. + const char* kernel_location = "./convolution.cl"; + std::ifstream kernel_stream{ kernel_location }; + if (!kernel_stream.is_open()) + throw std::runtime_error{ + std::string{ "Cannot open kernel source: " } + kernel_location + }; + + // Compile kernel. 
+ if (diag_opts.verbose) + { + std::cout << "done.\nCompiling kernel..."; + std::cout.flush(); + } + cl::Program program( + context, + std::string{ std::istreambuf_iterator{ kernel_stream }, + std::istreambuf_iterator{} }); + + // Query device and runtime capabilities. + // If no -cl-std option is specified then the highest 1.x version + // supported by each device is used to compile the program. Therefore, + // it's only necessary to add the -cl-std option for 2.0 and 3.0 OpenCL + // versions. + cl::string compiler_options; + constexpr int max_major_version = 3; + for (auto i = 2; i <= max_major_version; ++i) + { + std::string version_str = std::to_string(i) + "."; // "i." + std::string compiler_opt_str = + "-cl-std=CL" + std::to_string(i) + ".0 "; // -cl-std=CLi.0 + + compiler_options += cl::string{ cl::util::opencl_c_version_contains( + dev, version_str) + ? compiler_opt_str + : "" }; + } + program.build(subdevices, compiler_options.c_str()); + + // Initialize host-side storage. + constexpr cl_uint mask_dim = 3; + constexpr cl_uint pad_width = mask_dim / 2; + const size_t x_dim = conv_opts.x_dim; + const size_t y_dim = conv_opts.y_dim; + const size_t pad_x_dim = x_dim + 2 * pad_width; + const size_t pad_y_dim = y_dim + 2 * pad_width; + + const size_t input_size = pad_x_dim * pad_y_dim; + const size_t output_size = x_dim * y_dim; + const size_t mask_size = mask_dim * mask_dim; + + if (diag_opts.verbose) + { + std::cout << "done.\nInitializing host-side storage..."; + std::cout.flush(); + } + + // Random number generator. + auto prng = [engine = std::default_random_engine{}, + dist = std::uniform_real_distribution{ + -1.0, 1.0 }]() mutable { return dist(engine); }; + + // Initialize input matrix. The input will be padded to remove + // conditional branches from the convolution kernel for determining + // out-of-bounds. + std::vector h_input_grid(input_size); + if (diag_opts.verbose) + { + std::cout << "\n Generating " << output_size + << " random numbers for convolution input grid..."; + std::cout.flush(); + } + cl::sdk::fill_with_random(prng, h_input_grid); + + // Fill with 0s the extra rows and columns added for padding. + for (size_t y = 0; y < pad_y_dim; ++y) + { + for (size_t x = 0; x < pad_x_dim; ++x) + { + if (x == 0 || y == 0 || x == (pad_x_dim - 1) + || y == (pad_y_dim - 1)) + { + h_input_grid[y * pad_x_dim + x] = 0; + } + } + } + + // Declare output matrix. Output will not be padded. + std::vector h_output_grid(output_size); + + // Initialize convolution mask. + std::vector h_mask(mask_size); + if (diag_opts.verbose) + { + std::cout << "done. \nGenerating " << mask_size + << " random numbers for convolution mask..."; + std::cout.flush(); + } + cl::sdk::fill_with_random(prng, h_mask); + + // Create device buffers, from which we will create the subbuffers for + // the subdevices. 
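+        // The sub-buffer regions created below are specified in bytes; their
+        // origins must also satisfy the sub-device's base address alignment,
+        // which the loop below queries and checks before creating the
+        // sub-buffers.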
+ const size_t grid_midpoint = y_dim / 2; + const size_t pad_grid_midpoint = pad_y_dim / 2; + + if (diag_opts.verbose) + { + std::cout << "done.\nInitializing device-side storage..."; + std::cout.flush(); + } + + std::vector output_grid(output_size, 0); + cl::Buffer dev_input_grid(context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + input_size * sizeof(cl_float), h_input_grid.data()); + cl::Buffer dev_output_grid(context, + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + output_size * sizeof(cl_float), nullptr); + cl::Buffer dev_mask(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + mask_size * sizeof(cl_float), h_mask.data()); + + if (diag_opts.verbose) + { + std::cout << "done.\nSetting up sub-devices..."; + std::cout.flush(); + } + + // Set up subdevices for kernel execution. + const size_t half_input_size = pad_x_dim * (pad_grid_midpoint + 1); + const size_t input_offset = pad_x_dim * (pad_grid_midpoint - 1); + const size_t half_output_size = x_dim * grid_midpoint; + + std::vector> + convolutions{}; + std::vector sub_queues{}; + std::vector sub_input_grids{}, sub_output_grids{}, + sub_masks{}; + + for (size_t i = 0; i < subdevices.size(); ++i) + { + auto subdevice = subdevices[i]; + + if (diag_opts.verbose) + { + std::cout + << "\n Creating kernel and command queue of sub-device " + << i << "..."; + std::cout.flush(); + } + + auto convolution = + cl::KernelFunctor(program, "convolution_3x3"); + + cl::CommandQueue queue(context, subdevice, + cl::QueueProperties::Profiling); + + // Initialize device-side storage. + // First device performs the convolution in the upper half and + // second device in the lower half (middle borders included). + if (diag_opts.verbose) + { + std::cout << "done.\n Initializing device-side storage of " + "sub-device " + << i << "..."; + std::cout.flush(); + } + + cl_buffer_region input_region = { i * input_offset * sizeof(cl_float), + half_input_size * sizeof(cl_float) }, + output_region = { i * half_output_size * sizeof(cl_float), half_output_size * sizeof(cl_float) }, + mask_region = { 0, mask_size * sizeof(cl_float) }; + + const cl_uint align = + subdevice.getInfo(); + if (input_region.origin % align || output_region.origin % align) + { + std::cerr << "Error: Memory should be aligned to " + << subdevice.getInfo() + << std::endl; + exit(EXIT_FAILURE); + } + + cl::Buffer sub_input_grid = dev_input_grid.createSubBuffer( + CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &input_region); + cl::Buffer sub_output_grid = dev_output_grid.createSubBuffer( + CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &output_region); + cl::Buffer sub_mask = dev_mask.createSubBuffer( + CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &mask_region); + + if (diag_opts.verbose) + { + std::cout << "done."; + std::cout.flush(); + } + + sub_queues.push_back(queue); + convolutions.push_back(convolution); + sub_input_grids.push_back(sub_input_grid); + sub_output_grids.push_back(sub_output_grid); + sub_masks.push_back(sub_mask); + } + + // Launch kernels. + if (diag_opts.verbose) + { + std::cout << "\nExecuting on device... "; + std::cout.flush(); + } + + // Initialize global and local buffers for device execution. + const cl::NDRange global{ x_dim, y_dim / 2 }; + + // Enqueue kernel calls and wait for them to finish. 
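+        // Each sub-device has its own in-order command queue, so the two
+        // kernels can run concurrently; the host only blocks after both have
+        // been enqueued, by waiting on their respective events.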
+ std::vector dev1_kernel_runs; dev1_kernel_runs.reserve(1); + std::vector dev2_kernel_runs; dev2_kernel_runs.reserve(1); + auto dev_start = std::chrono::high_resolution_clock::now(); + + dev1_kernel_runs.push_back(convolutions[0]( + cl::EnqueueArgs{ sub_queues[0], global }, + sub_input_grids[0], sub_output_grids[0], sub_masks[0], x_dim, + grid_midpoint)); + dev2_kernel_runs.push_back(convolutions[1]( + cl::EnqueueArgs{ sub_queues[1], global }, + sub_input_grids[1], sub_output_grids[1], sub_masks[1], x_dim, + grid_midpoint)); + + cl::WaitForEvents(dev1_kernel_runs); + cl::WaitForEvents(dev2_kernel_runs); + + auto dev_end = std::chrono::high_resolution_clock::now(); + + // Compute reference host-side convolution. + if (diag_opts.verbose) + { + std::cout << " done.\nExecuting on host... "; + std::cout.flush(); + } + auto host_start = std::chrono::high_resolution_clock::now(); + + host_convolution(h_input_grid, h_output_grid, h_mask, x_dim, y_dim); + + auto host_end = std::chrono::high_resolution_clock::now(); + + if (diag_opts.verbose) + { + std::cout << "done." << std::endl; + } + + // Fetch and combine results from devices. + std::vector concatenated_results(output_size); + cl::copy(sub_queues.front(), dev_output_grid, concatenated_results.begin(), concatenated_results.end()); + + // Validate device-side solution. + cl_float deviation = 0.f; + const cl_float tolerance = 1e-6; + + for (size_t i = 0; i < concatenated_results.size(); ++i) + { + deviation += std::fabs(concatenated_results[i] - h_output_grid[i]); + } + deviation /= concatenated_results.size(); + + if (deviation > tolerance) + { + std::cerr << "Failed convolution! Normalized deviation " + << deviation + << " between host and device exceeds tolerance " + << tolerance << std::endl; + } + else + { + std::cout << "Successful convolution!" << std::endl; + } + + if (!diag_opts.quiet) + { + std::cout << "Kernels execution time as seen by host: " + << std::chrono::duration_cast( + dev_end - dev_start) + .count() + << " us." << std::endl; + std::cout << "Kernels execution time as measured by devices: " + << std::endl; + for (auto& pass : dev1_kernel_runs) + std::cout << " - " + << cl::util::get_duration( + pass) + .count() + << " us." << std::endl; + for (auto& pass : dev2_kernel_runs) + std::cout << " - " + << + cl::util::get_duration( + pass) + .count() + << " us." << std::endl; + std::cout << "Reference execution as seen by host: " + << std::chrono::duration_cast( + host_end - host_start) + .count() + << " us." << std::endl; + } + } catch (cl::BuildError& e) + { + std::cerr << "OpenCL build error: " << e.what() << std::endl; + for (auto& build_log : e.getBuildLog()) + { + std::cerr << "\tBuild log for device: " + << build_log.first.getInfo() << "\n" + << std::endl; + std::cerr << build_log.second << "\n" << std::endl; + } + std::exit(e.err()); + } catch (cl::util::Error& e) + { + std::cerr << "OpenCL utils error: " << e.what() << std::endl; + std::exit(e.err()); + } catch (cl::Error& e) + { + std::cerr << "OpenCL runtime error: " << e.what() << std::endl; + std::exit(e.err()); + } catch (std::exception& e) + { + std::cerr << "Error: " << e.what() << std::endl; + std::exit(EXIT_FAILURE); + } + + return 0; +}
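A note on collecting the results: main.cpp copies the entire parent `dev_output_grid` with a single `cl::copy`, while main.c reads each half from its own output sub-buffer. A sketch of the equivalent per-half read in the C++ sample (reusing the `sub_queues`, `sub_output_grids`, `half_output_size` and `output_size` names defined in main.cpp above) could look like this:

```c++
// Sketch only: read each sub-device's half directly from its sub-buffer,
// mirroring the two clEnqueueReadBuffer calls in main.c. cl::copy performs
// a blocking read on the given queue.
std::vector<cl_float> concatenated_results(output_size);
for (size_t i = 0; i < sub_queues.size(); ++i)
{
    cl::copy(sub_queues[i], sub_output_grids[i],
             concatenated_results.begin() + i * half_output_size,
             concatenated_results.begin() + (i + 1) * half_output_size);
}
```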