Skip to content

Commit 3ba29f3

Browse files
JackAKirkAlcpzjoeatodd
authored
[SYCL][COMPAT][cuda] Add "ptr_to_integer" syclcompat functions. (#14283)
Add "ptr_to_integer" (generic address space to .shared) syclcompat functions. These functions are commonly required in optimized libraries that use inline ptx. The standard naming convention of removing "__" from corresponding cuda builtins has been applied. See the readme and accompanying test-e2e for example usage. --------- Signed-off-by: JackAKirk <[email protected]> Co-authored-by: Alberto Cabrera Pérez <[email protected]> Co-authored-by: Joe Todd <[email protected]>
1 parent 3a1c3cb commit 3ba29f3

File tree

3 files changed

+116
-0
lines changed

3 files changed

+116
-0
lines changed

sycl/doc/syclcompat/README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -855,6 +855,41 @@ public:
855855
} // syclcompat
856856
```
857857
858+
### ptr_to_int
859+
860+
The following CUDA backend-specific function is introduced in order to
861+
translate from local memory pointers to `uint32_t` or `size_t` variables that
862+
contain a byte address to the local (local refers to `.shared` in nvptx) memory
863+
state space.
864+
865+
``` c++
866+
namespace syclcompat {
867+
template <typename T>
868+
__syclcompat_inline__
869+
std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
870+
T>
871+
ptr_to_int(void *ptr);
872+
} // namespace syclcompat
873+
```
874+
875+
These variables can be used in inline PTX instructions that take address
876+
operands. Such inline PTX instructions are commonly used in optimized
877+
libraries. A simplified example usage of the above functions is as follows:
878+
879+
``` c++
880+
half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
881+
// ...
882+
// ...
883+
T addr =
884+
syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);
885+
uint32_t fragment;
886+
#if defined(__NVPTX__)
887+
asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
888+
: "=r"(fragment)
889+
: "r"(addr));
890+
#endif
891+
```
892+
858893
### Device Information
859894

860895
`sycl::device` properties are encapsulated using the `device_info` helper class.

sycl/include/syclcompat/memory.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353

5454
#include <syclcompat/device.hpp>
5555
#include <syclcompat/traits.hpp>
56+
#include <syclcompat/defs.hpp>
5657

5758
#if defined(__linux__)
5859
#include <sys/mman.h>
@@ -86,6 +87,23 @@ enum memcpy_direction {
8687
};
8788
}
8889

90+
// Converts a pointer into local memory (the ".shared" state space in NVPTX)
// to an integer byte address of type T, where T is uint32_t or size_t. The
// result is intended to be used as an address operand in inline PTX.
// Only meaningful in device code compiled for the CUDA (NVPTX) backend; on
// any other target this throws a sycl::exception at runtime.
template <typename T>
91+
__syclcompat_inline__
92+
std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
93+
T>
94+
ptr_to_int(void *ptr) {
95+
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
96+
if constexpr (std::is_same_v<T, uint32_t>) {
97+
// Decorate the pointer with the local address space, then narrow the
// address from intptr_t to the 32-bit return type. The narrowing is
// presumably intentional (shared-memory addresses fit in 32 bits on
// NVPTX) — TODO confirm against the CUDA __cvta_generic_to_shared
// convention this mirrors.
return (intptr_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
98+
} else {
99+
return (size_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
100+
}
101+
#else
102+
// Host / non-NVPTX compilation path: fail loudly rather than return a
// bogus address.
throw sycl::exception(make_error_code(sycl::errc::runtime),
103+
"ptr_to_int is only supported on Nvidia devices.");
104+
#endif
105+
}
106+
89107
enum class memory_region {
90108
global = 0, // device global memory
91109
constant, // device read-only memory
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
// REQUIRES: cuda
2+
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
3+
// RUN: %{run} %t.out
4+
#include <sycl/detail/core.hpp>
5+
#include <sycl/group_barrier.hpp>
6+
#include <syclcompat/memory.hpp>
7+
8+
using namespace sycl;
9+
#define NUM_ELEMENTS 64
10+
11+
// E2E check for syclcompat::ptr_to_int<T>: each of 32 work-items writes two
// half values (id, id + 0.5) into local memory, converts a local pointer to
// an integer address of type T, feeds that address to an inline-PTX
// ldmatrix load, and copies the loaded fragment back to shared USM so the
// host can verify the round trip.
template <class T> void test(queue stream) {
12+
half *res = malloc_shared<half>(NUM_ELEMENTS, stream);
13+
14+
// Pre-fill with a sentinel value so a kernel that silently does nothing
// would fail the assertions below.
for (int i = 0; i < NUM_ELEMENTS; ++i) {
15+
res[i] = 0.5;
16+
}
17+
18+
// Single work-group of 32 work-items (one full warp on the CUDA backend).
sycl::nd_range<1> global_range{sycl::range{32}, sycl::range{32}};
19+
20+
stream
21+
.submit([&](handler &h) {
22+
h.parallel_for<T>(global_range, [=](nd_item<1> item) {
23+
sycl::group work_group = item.get_group();
24+
int id = item.get_global_linear_id();
25+
half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
26+
27+
// Each work-item publishes two consecutive halves into local memory.
data[id * 2] = id;
28+
data[id * 2 + 1] = id + 0.5;
29+
30+
// (id % 8) * 16: each of 8 row addresses is 16 bytes apart,
// presumably matching the m8n8 .b16 ldmatrix operand layout —
// TODO confirm against the PTX ISA ldmatrix description.
T addr =
31+
syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);
32+
33+
uint32_t fragment;
34+
// Guarded so the source still parses when not compiling for NVPTX.
#if defined(__NVPTX__)
35+
asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
36+
: "=r"(fragment)
37+
: "r"(addr));
38+
#endif
39+
sycl::group_barrier(work_group);
40+
41+
// The 32-bit fragment packs two half values; unpack and write them
// back to shared USM for host-side checking.
half *data_ptr = reinterpret_cast<half *>(&fragment);
42+
res[id * 2] = data_ptr[0];
43+
res[id * 2 + 1] = data_ptr[1];
44+
});
45+
})
46+
.wait();
47+
48+
// Expected pattern: res = {0, 0.5, 1, 1.5, ...}, i.e. res[i] == i / 2.0.
for (int i = 0; i < NUM_ELEMENTS; i++) {
49+
assert(res[i] == static_cast<half>(i / 2.0));
50+
}
51+
52+
free(res, stream);
53+
};
54+
55+
// Runs the ptr_to_int round-trip test for both supported address types
// (size_t and uint32_t) on an in-order queue, printing PASS on success.
int main() {
56+
57+
queue stream{property::queue::in_order{}};
58+
test<size_t>(stream);
59+
test<uint32_t>(stream);
60+
61+
std::cout << "PASS" << std::endl;
62+
return 0;
63+
}

0 commit comments

Comments
 (0)