Skip to content

Commit cc5e561

Browse files
authored
Implemented Callback sample (KhronosGroup#87)
* Implemented callback sample * Minor fixes from code review * Minor fixes from code review II.
1 parent de7cb8a commit cc5e561

File tree

10 files changed

+1301
-4
lines changed

10 files changed

+1301
-4
lines changed

lib/include/CL/Utils/Context.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,7 @@ namespace util {
1313
Context UTILSCPP_EXPORT get_context(cl_uint plat_id, cl_uint dev_id,
1414
cl_device_type type,
1515
cl_int* error = nullptr);
16+
17+
void UTILSCPP_EXPORT print_device_info(const cl::Device& device);
1618
}
1719
}

lib/src/Utils/Context.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
// OpenCL SDK includes
22
#include <CL/Utils/Context.hpp>
33

4+
#include <iostream>
5+
#include <string>
6+
47
cl::Context cl::util::get_context(cl_uint plat_id, cl_uint dev_id,
58
cl_device_type type, cl_int* error)
69
{
@@ -40,3 +43,16 @@ cl::Context cl::util::get_context(cl_uint plat_id, cl_uint dev_id,
4043

4144
return cl::Context{};
4245
}
46+
47+
void cl::util::print_device_info(const cl::Device& device)
48+
{
49+
const cl::Platform platform(device.getInfo<CL_DEVICE_PLATFORM>());
50+
const std::string platform_vendor = platform.getInfo<CL_PLATFORM_VENDOR>();
51+
const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
52+
const std::string device_opencl_c_version =
53+
device.getInfo<CL_DEVICE_OPENCL_C_VERSION>();
54+
std::cout << "Selected platform by " << platform_vendor
55+
<< "\nSelected device: " << device_name << '\n'
56+
<< device_opencl_c_version << '\n'
57+
<< std::endl;
58+
}

samples/core/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,11 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
add_subdirectory(enumopencl)
15+
add_subdirectory(binaries)
16+
add_subdirectory(blur)
17+
add_subdirectory(callback)
1618
add_subdirectory(copybuffer)
1719
add_subdirectory(copybufferkernel)
18-
add_subdirectory(saxpy)
20+
add_subdirectory(enumopencl)
1921
add_subdirectory(reduce)
20-
add_subdirectory(blur)
21-
add_subdirectory(binaries)
22+
add_subdirectory(saxpy)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright (c) 2023 The Khronos Group Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
find_package(Threads)
16+
17+
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/threads.c"
18+
"#include <threads.h>
19+
#include <time.h>
20+
int main(void) { thrd_sleep(&(struct timespec){.tv_nsec=1}, NULL); }
21+
")
22+
23+
# Signature can be modernized at CMake version 3.25
24+
try_compile(
25+
HAS_C11_THREADS
26+
"${CMAKE_CURRENT_BINARY_DIR}"
27+
SOURCES "${CMAKE_CURRENT_BINARY_DIR}/threads.c"
28+
C_STANDARD 11
29+
C_STANDARD_REQUIRED ON
30+
)
31+
32+
if (HAS_C11_THREADS)
33+
add_sample(
34+
TEST
35+
TARGET callback
36+
VERSION 300
37+
SOURCES main.c
38+
KERNELS reaction_diffusion.cl)
39+
target_link_libraries(callback PRIVATE
40+
$<TARGET_NAME_IF_EXISTS:Threads::Threads>)
41+
else()
42+
message(WARNING
43+
"Skipping callback sample, C11 standard threads are not supported with the current toolset")
44+
endif()
45+
46+
add_sample(
47+
TEST
48+
TARGET callbackcpp
49+
VERSION 300
50+
SOURCES main.cpp
51+
KERNELS reaction_diffusion.cl)
52+
target_link_libraries(callbackcpp PRIVATE
53+
$<TARGET_NAME_IF_EXISTS:Threads::Threads>)

samples/core/callback/README.md

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Callback sample - synchronizing Command Queues with Events
2+
3+
## Sample purpose
4+
5+
This sample demonstrates how to synchronize the execution of multiple command queues with each other and with host-side calculations.
6+
7+
## Key APIs and Concepts
8+
9+
If all operations are enqueued on a single command queue, the execution of the operations is sequential, and the ordering of the operations matches the order in which they were enqueued (unless `CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE` was specified).
10+
11+
However, sometimes it is more performant to enqueue unrelated operations to separate command queues, enabling the parallel execution capabilities of the device. The primer utility of synchronization across command queues are **OpenCL event objects**. Each enqueue operation can take a list of events which it will synchronize with. Also, each enqueue operation can emit a single event object that can be used to synchronize other operations with that one. Additionally, using the **callback functionality**, host code can be executed at various points in the event's lifetime.
12+
13+
### Kernel logic
14+
15+
The sample implements the simulation of a theoretical chemical reaction, namely the [Gray-Scott model](https://groups.csail.mit.edu/mac/projects/amorphous/GrayScott/) of the [reaction-diffusion system](https://en.wikipedia.org/wiki/Reaction%E2%80%93diffusion_system). Explaining the algorithm is not the goal of this example. We only need to know, that the simulation progresses iteratively: given a 2D lattice of the concentration of the $U$ and $V$ chemicals (stored in the R and G channels of an image texture respectively), the subsequent state is calculated by the kernel. The algorithm results in visually interesting shapes, with the order being obvious to the viewer: the more "spread" chemical $V$ is, the later iteration step we observe.
16+
17+
## Application flow
18+
19+
### Overview
20+
21+
From an initial state, the iteration progresses through a fixed number of steps. The number of steps can be specified on the command line. Two 2D image objects are used to store the state of the simulation, one for the input and another for the output state, and the images are swapped after each step. At every Nth iteration (N being controlled from the command line), the state of the simulation is dispatched for writing to the disk as a PNG image.
22+
23+
### Command queues and synchronization
24+
25+
<p align="center">
26+
<img alt="Sequence diagram of the sample's execution flow" src="callback_sample_sequence.svg"/>
27+
</p>
28+
29+
In this example, kernel launches, image-to-buffer copies (device-to-device copy) and buffer reads (device-to-host copy) are performed. Some systems allow the overlapping of these operations, thereby it makes sense to enqueue all three of these operation types to separate command queues. The journey of each iteration state is the following:
30+
31+
1. The state is calculated from the previous state by the `reaction_diffusion_step` kernel. The kernel launch synchronizes with the previous image-to-buffer copy, which could have happened in the previous iteration, but also quite a few iterations ago.
32+
2. If the iteration index is a multiple of N, a device buffer object and a host vector of the same size are allocated. Otherwise, the next iteration starts calculating at step 1.
33+
3. A copy of the output image to the allocated buffer object is enqueued. Eventually the simulation state is read to host memory, but since device-to-device copy is usually faster than device-to-host copy, first the previous output image is copied to a buffer and this buffer is read to the host in step 4. Note, that the copy can be performed concurrently with the kernel launch of the next iteration, since they both read from the same image object. Only the subsequent iteration's compute launch has to synchronize with this copy.
34+
4. The read of the buffer (i.e. device-to-host copy) is enqueued on the read queue. If the device has concurrent copy capabilities, this read potentially overlaps with a previous copy (step 3.) operation.
35+
5. After the read of the buffer is completed, its contents need to be written to an image file. This has to synchronize with the read operation, but this time, we need to execute host code instead of an OpenCL enqueue. For that, we set the completion callback of the event produced by the read enqueue. A `void*` argument is passed to the callback, which is used to identify the host vector containing the data.
36+
6. The callback is executed on a thread used by the OpenCL runtime. Therefore it is advised that the callback returns as quickly as possible. To achieve this, the image write is dispatched to a different thread, using `std::async` in the C++ version of the sample, and `thrd_create` in the C version.
37+
7. When the image write has finished, the completion is signaled back to the waiting main thread, otherwise the executable would possibly exit before completion. For this purpose `std::future::wait` is used in the C++ version, and a conditional variable in the C version.
38+
39+
## Used API surface (C++)
40+
41+
```c++
42+
cl::Buffer::Buffer(cl::Context, cl_mem_flags, std::size_t size)
43+
cl::CommandQueue::enqueueCopyImageToBuffer(cl::Image2D, cl::Buffer, std::array<size_type, 3>,
44+
std::array<size_type, 3>, std::size_t,
45+
std::vector<cl::Event>*, cl::Event*)
46+
cl::CommandQueue::enqueueFillImage(cl::Image2D, cl_float4, std::array<size_type, 3>, std::array<size_type, 3>)
47+
cl::CommandQueue::enqueueReadBuffer(cl::Buffer, bool, std::size_t, std::size_t,
48+
void*, std::vector<cl::Event>*, cl::Event*)
49+
cl::Context::getInfo<CL_CONTEXT_DEVICES>()
50+
cl::Context::getSupportedImageFormats(cl_mem_flags, cl_mem_object_type, std::vector<cl::ImageFormat>*)
51+
cl::Device::getInfo<CL_DEVICE_NAME>()
52+
cl::Device::getInfo<CL_DEVICE_PLATFORM>()
53+
cl::EnqueueArgs::EnqueueArgs(cl::CommandQueue, cl::Event, cl::NDRange)
54+
cl::Event::Event()
55+
cl::Event::Event(cl::Event)
56+
cl::Event::setCallback(cl_int, void(*)(cl_event, cl_int, void*), void*)
57+
cl::Image2D::Image2D(cl::Context, cl_mem_flags, cl::ImageFormat, std::size_t, std::size_t)
58+
cl::ImageFormat::ImageFormat(cl_channel_order, cl_channel_type)
59+
cl::KernelFunctor<cl::Image2D, cl::Image2D>(cl::Program, std::string)
60+
cl::NDRange(std::size_t, std::size_t)
61+
cl::Platform::getInfo<CL_PLATFORM_VENDOR>()
62+
cl::Platform::Platform(cl_platform)
63+
cl::Program::build(cl::Device)
64+
cl::Program::Program(cl::Context, std::string)
65+
cl::sdk::comprehend()
66+
cl::sdk::parse()
67+
cl::sdk::parse_cli()
68+
cl::UserEvent::setStatus(cl_int)
69+
cl::UserEvent::UserEvent(cl::Context)
70+
```
71+
72+
## Used API surface (C)
73+
74+
```c
75+
clCreateBuffer
76+
clCreateCommandQueueWithProperties
77+
clCreateContext
78+
clCreateImage
79+
clCreateKernel
80+
clCreateProgramWithSource
81+
clEnqueueCopyImageToBuffer
82+
clEnqueueFillImage
83+
clEnqueueNDRangeKernel
84+
clEnqueueReadBuffer
85+
clGetDeviceInfo
86+
clGetSupportedImageFormats
87+
clReleaseCommandQueue
88+
clReleaseContext
89+
clReleaseDevice
90+
clReleaseEvent
91+
clReleaseEvent
92+
clReleaseKernel
93+
clReleaseMemObject
94+
clReleaseProgram
95+
clSetEventCallback
96+
clSetKernelArg
97+
cnd_destroy
98+
cnd_signal
99+
mtx_destroy
100+
mtx_lock
101+
mtx_unlock
102+
thrd_create
103+
thrd_detach
104+
```

0 commit comments

Comments
 (0)