intel · gmlueck · Jun 27, 2023 · Jul 27, 2023 · Oct 8, 2024 · Oct 8, 2024
@@ -1,74 +1,99 @@
-= SYCL_INTEL_dot_accumulate
+= sycl_ext_oneapi_dot_accumulate
+
 :source-highlighter: coderay
 :coderay-linenums-mode: table
+
+// This section needs to be after the document title.
 :doctype: book
+:toc2:
+:toc: left
 :encoding: utf-8
 :lang: en
+:dpcpp: pass:[DPC++]
 
-:blank: pass:[ +]
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
 
-// Set the default source code type in this document to C,
-// for syntax highlighting purposes.
-:language: c
 
-// This is what is needed for C++, since docbook uses c++
-// and everything else uses cpp.  This doesn't work when
-// source blocks are in table cells, though, so don't use
-// C++ unless it is required.
-//:language: {basebackend@docbook:c++:cpp}
+== Notice
 
-== Introduction
+[%hardbreaks]
+Copyright (C) 2020 Intel Corporation.  All rights reserved.
 
-IMPORTANT: This specification is a draft.
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
 
-NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos.
 
-NOTE: This document is better viewed when rendered as html with asciidoctor.  GitHub does not render image icons.
+== Contact
 
-== Name Strings
+To report problems with this extension, please open a new issue at:
 
-`SYCL_ONEAPI_dot_accumulate`
+https://github.com/intel/llvm/issues
 
-This is a placeholder name.
 
-== Notice
+== Dependencies
+
+This extension is written against the SYCL 2020 revision 9 specification.  All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
 
-Copyright (c) 2020 Intel Corporation.  All rights reserved.
 
 == Status
 
-Working Draft
+This extension is implemented and fully supported by {dpcpp}.
 
-This is a preview extension specification, intended to provide early access to a feature for review and community feedback. When the feature matures, this specification may be released as a formal extension.
 
-Because the interfaces defined by this specification are not final and are subject to change they are not intended to be used by shipping software products.
+== Overview
 
-== Version
+This extension adds new SYCL built-in functions that may simplify development
+and provide access to specialized hardware instructions when a SYCL kernel needs
+to perform a dot product of two vectors followed by a scalar accumulation.
 
-Built On: {docdate} +
-Revision: 3
 
-== Contact
+== Specification
 
-Ben Ashbaugh, Intel (ben 'dot' ashbaugh 'at' intel 'dot' com)
+=== Feature test macro
 
-== Dependencies
+This extension provides a feature-test macro as described in the core SYCL
+specification.  An implementation supporting this extension must predefine the
+macro `SYCL_EXT_ONEAPI_DOT_ACCUMULATE` to one of the values defined in the table
+below.  Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's features the implementation
+supports.
 
-This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-6.
+[%header,cols="1,5"]
+|===
+|Value
+|Description
 
-== Overview
+|1
+|Initial version of this extension.
+|===
 
-This extension adds new SYCL built-in functions that may simplify development and provide access specialized hardware instructions when a SYCL kernel needs to perform a dot product of two vectors followed by a scalar accumulation.
+=== New functions to compute the dot product of vectors
 
-== Enabling the extension
+This extension adds the following free functions:
 
-The extension is always enabled. The dot product functionality may be emulated in software or executed using hardware when suitable instructions are available.
+[source,c++]
+----
+namespace sycl::ext::oneapi {
 
-== Modifications of SYCL 1.2.1 specification
+int32_t dot_acc(vec<int8_t,4>  a, vec<int8_t,4>  b, int32_t c);
+int32_t dot_acc(vec<int8_t,4>  a, vec<uint8_t,4> b, int32_t c);
+int32_t dot_acc(vec<uint8_t,4> a, vec<int8_t,4>  b, int32_t c);
+uint32_t dot_acc(vec<uint8_t,4> a, vec<uint8_t,4> b, uint32_t c);
 
-=== Add to Section 4.13.6 - Geometric Functions
+int32_t dot_acc_4x8packed_ss(uint32_t a, uint32_t b, int32_t c);
+int32_t dot_acc_4x8packed_su(uint32_t a, uint32_t b, int32_t c);
+int32_t dot_acc_4x8packed_us(uint32_t a, uint32_t b, int32_t c);
+uint32_t dot_acc_4x8packed_uu(uint32_t a, uint32_t b, uint32_t c);
 
-Additionally, the following additional functions are available in the namespace `sycl::intel` on the host and device.
+} // namespace sycl::ext::oneapi
+----
 
 [cols="4a,4",options="header"]
 |====
@@ -86,65 +111,136 @@ int32_t dot_acc(vec<int8_t,4>  a,
 int32_t dot_acc(vec<uint8_t,4> a,
                 vec<int8_t,4>  b,
                 int32_t c)
-int32_t dot_acc(vec<uint8_t,4> a,
-                vec<uint8_t,4> b,
-                int32_t c)
+uint32_t dot_acc(vec<uint8_t,4> a,
+                 vec<uint8_t,4> b,
+                 uint32_t c)
 ----
 
-|Performs a four-component integer dot product accumulate operation. +
-{blank}
-The value that is returned is equivalent to +
-{blank}
-*dot*(_a_, _b_) + _c_
+|Performs a four-component integer dot product accumulate operation.  The value
+that is returned is equivalent to `dot(a, b) + c`, where `dot` computes the
+dot product of two vectors.
+
+|[source,c]
+----
+int32_t dot_acc_4x8packed_ss(uint32_t a,
+                             uint32_t b,
+                             int32_t c)
+----
+
+|Performs a four-component integer dot product accumulate operation, where
+`a` and `b` are both interpreted as `vec<int8_t,4>`.
+
+|[source,c]
+----
+int32_t dot_acc_4x8packed_su(uint32_t a,
+                             uint32_t b,
+                             int32_t c)
+----
+
+|Performs a four-component integer dot product accumulate operation, where
+`a` is interpreted as `vec<int8_t,4>` and `b` is interpreted as
+`vec<uint8_t,4>`.
+
+|[source,c]
+----
+int32_t dot_acc_4x8packed_us(uint32_t a,
+                             uint32_t b,
+                             int32_t c)
+----
+
+|Performs a four-component integer dot product accumulate operation, where
+`a` is interpreted as `vec<uint8_t,4>` and `b` is interpreted as
+`vec<int8_t,4>`.
+
+|[source,c]
+----
+uint32_t dot_acc_4x8packed_uu(uint32_t a,
+                              uint32_t b,
+                              uint32_t c)
+----
 
+|Performs a four-component integer dot product accumulate operation, where
+`a` and `b` are both interpreted as `vec<uint8_t,4>`.
 |====
 
-== Sample Header
+For all the "packed" overloads, the least significant byte of each packed
+integer argument is element 0, and the most significant byte is element 3.
+
+=== Deprecated functions
+
+The following functions are deprecated.
 
 [source,c++]
 ----
-namespace cl {
-namespace sycl {
-namespace ext {
-namespace oneapi {
+namespace sycl::ext::oneapi {
 
-int32_t dot_acc(vec<int8_t,4>  a, vec<int8_t,4>  b, int32_t c);
-int32_t dot_acc(vec<int8_t,4>  a, vec<uint8_t,4> b, int32_t c);
-int32_t dot_acc(vec<uint8_t,4> a, vec<int8_t,4>  b, int32_t c);
 int32_t dot_acc(vec<uint8_t,4> a, vec<uint8_t,4> b, int32_t c);
 
 int32_t dot_acc(int32_t a, int32_t b, int32_t c);
 int32_t dot_acc(int32_t a, uint32_t b, int32_t c);
 int32_t dot_acc(uint32_t a, int32_t b, int32_t c);
 int32_t dot_acc(uint32_t a, uint32_t b, int32_t c);
 
-} // oneapi
-} // ext
-} // sycl
-} // cl
+} // namespace sycl::ext::oneapi
+----
+
+[cols="4a,4",options="header"]
+|====
+| *Function*
+| *Description*
+
+|[source,c]
+----
+int32_t dot_acc(vec<uint8_t,4> a,
+                vec<uint8_t,4> b,
+                int32_t c)
+----
+
+|Performs a four-component integer dot product accumulate operation, where the
+elements of `a` and `b` are unsigned while `c` is signed.  Use the version
+taking an unsigned `c` instead.
+
+|[source,c]
+----
+int32_t dot_acc(int32_t a,
+                int32_t b,
+                int32_t c)
 ----
 
-== Issues
+|Performs a four-component integer dot product accumulate operation, where
+`a` and `b` are both interpreted as `vec<int8_t,4>`.  Use
+`dot_acc_4x8packed_ss` instead.
 
-None.
+|[source,c]
+----
+int32_t dot_acc(int32_t a,
+                uint32_t b,
+                int32_t c)
+----
 
-== Revision History
+|Performs a four-component integer dot product accumulate operation, where
+`a` is interpreted as `vec<int8_t,4>` and `b` is interpreted as
+`vec<uint8_t,4>`.  Use `dot_acc_4x8packed_su` instead.
 
-[cols="5,15,15,70"]
-[grid="rows"]
-[options="header"]
-|========================================
-|Rev|Date|Author|Changes
-|1|2019-12-13|Ben Ashbaugh|*Initial draft*
-|2|2019-12-18|Ben Ashbaugh|Switched to standard C++ fixed width types.
-|3|2020-10-26|Rajiv Deodhar|Added int32 types.
-|========================================
+|[source,c]
+----
+int32_t dot_acc(uint32_t a,
+                int32_t b,
+                int32_t c)
+----
+
+|Performs a four-component integer dot product accumulate operation, where
+`a` is interpreted as `vec<uint8_t,4>` and `b` is interpreted as
+`vec<int8_t,4>`.  Use `dot_acc_4x8packed_us` instead.
 
-//************************************************************************
-//Other formatting suggestions:
-//
-//* Use *bold* text for host APIs, or [source] syntax highlighting.
-//* Use `mono` text for device APIs, or [source] syntax highlighting.
-//* Use `mono` text for extension names, types, or enum values.
-//* Use _italics_ for parameters.
-//************************************************************************
+|[source,c]
+----
+int32_t dot_acc(uint32_t a,
+                uint32_t b,
+                int32_t c)
+----
+
+|Performs a four-component integer dot product accumulate operation, where
+`a` and `b` are both interpreted as `vec<uint8_t,4>`.  Use
+`dot_acc_4x8packed_uu` instead.
+|====