Add a payload version byte to the payload header for compatibility with the extra output tensors in 2.2.1

jirioc · jirioc · commit 31306d116804 · 2025-11-25T15:51:43.000+01:00
diff --git a/backends/nxp/neutron_node_extraction.py b/backends/nxp/neutron_node_extraction.py
@@ -21,6 +21,7 @@ class NeutronNodeArtifacts:
     microcode: np.ndarray
     weights: np.ndarray
     kernels: np.ndarray
+    payload_version: int
 
 
 def extract_artifacts_from_neutron_node(
@@ -123,7 +124,12 @@ def extract_artifacts_from_neutron_node(
     output_names = []
     output_indices = []
     graph_outputs = sub_graph.OutputsAsNumpy()
+    payload_version = 0
+    # Ignore the extra outputs: scratch and, if present, also profile and debug
     node_outputs = neutron_node.OutputsAsNumpy()[:-1]
+    if len(graph_outputs) == len(node_outputs) - 2:
+        payload_version = 1
+        node_outputs = node_outputs[:-2]
     for tensor_idx in node_outputs:
         which_graph_output = np.where(graph_outputs == tensor_idx)[0]
         assert (
@@ -142,4 +148,5 @@ def extract_artifacts_from_neutron_node(
         microcode,
         weights,
         kernels,
+        payload_version,
     )
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
@@ -283,6 +283,8 @@ def _create_payload_header(
         +----------------------------------------+------------------------------------------+
         | 1st output map (1B)                    | [nth* output map (1B)]                   |
         +----------------------------------------+------------------------------------------+
+        | Payload version (1B)                                                              |
+        +-----------------------------------------------------------------------------------+
 
         :param io_formats: IO tensors formats.
         :return: Bytes representation of payload header.
@@ -325,6 +327,7 @@ def _create_payload_header(
 
         header_data.extend(neutron_artifacts.input_indices)
         header_data.extend(neutron_artifacts.output_indices)
+        header_data.append(neutron_artifacts.payload_version)
 
         # noinspection PyTypeChecker
         return np.array(header_data, dtype=np.uint8)
diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp
@@ -38,6 +38,8 @@ namespace neutron {
      +----------------------------------------+------------------------------------------+
      | 1st output map (1B)                    | [nth* output map (1B)]                   |
      +----------------------------------------+------------------------------------------+
+     | Payload version (1B)                                                              |
+     +-----------------------------------------------------------------------------------+
 */
 // clang-format on
 #define ITEM_SIZE 1 // 1 Byte
@@ -53,10 +55,13 @@ namespace neutron {
 #define OUTPUT_TENSOR_MAP_ARRAY_ADDR(base)                        \
   (base + 3 * ITEM_SIZE + 2 * base[INPUT_TENSOR_FORMAT_LEN_POS] + \
    1 * base[OUTPUT_TENSOR_FORMAT_LEN_POS])
+#define PAYLOAD_VERSION_ADDR(base)                                \
+  (base + 3 * ITEM_SIZE + 2 * base[INPUT_TENSOR_FORMAT_LEN_POS] + \
+   2 * base[OUTPUT_TENSOR_FORMAT_LEN_POS])
 #define PAYLOAD_ADDR(base)                                     \
   (base +                                                      \
    ALIGN_SIZE(                                                 \
-       3 * ITEM_SIZE + 2 * base[INPUT_TENSOR_FORMAT_LEN_POS] + \
+       4 * ITEM_SIZE + 2 * base[INPUT_TENSOR_FORMAT_LEN_POS] + \
        2 * base[OUTPUT_TENSOR_FORMAT_LEN_POS]))
 
 // Aggregate neutron model handle and data structures into one.
@@ -65,6 +70,8 @@ typedef struct {
   int numOutputs = 0;
   int numInputArgs = 0;
   uint32_t scratchSize = 0;
+  uint32_t profileSize = 0;
+  uint32_t debugSize = 0;
   NeutronModelConfig mcfg;
   NeutronDataConfig dcfg;
   NeutronModelHandle nmh = NULL;
@@ -269,6 +276,7 @@ class NeutronBackend final : public PyTorchBackendInterface {
         OUTPUT_TENSOR_FORMAT_ARRAY_ADDR(payloadFlags);
     cfg->inputMap = INPUT_TENSOR_MAP_ARRAY_ADDR(payloadFlags);
     cfg->outputMap = OUTPUT_TENSOR_MAP_ARRAY_ADDR(payloadFlags);
+    uint8_t payloadVersion = *PAYLOAD_VERSION_ADDR(payloadFlags);
 
     const uint32_t* buffer = static_cast<const uint32_t*>(
         static_cast<const void*> PAYLOAD_ADDR(payloadFlags));
@@ -282,9 +290,28 @@ class NeutronBackend final : public PyTorchBackendInterface {
     }
     uint32_t microcodeSize = buffer[6];
     uint32_t weightsSize = buffer[7];
-    cfg->scratchSize = buffer[9];
-    cfg->numInputs = buffer[11];
-    cfg->numOutputs = buffer[12];
+    switch (payloadVersion) {
+      case 0:
+        cfg->scratchSize = buffer[9];
+        cfg->profileSize = 0;
+        cfg->debugSize = 0;
+        cfg->numInputs = buffer[11];
+        cfg->numOutputs = buffer[12];
+        break;
+      case 1:
+        cfg->scratchSize = buffer[9];
+        cfg->profileSize = buffer[10];
+        cfg->debugSize = buffer[11];
+        cfg->numInputs = buffer[13];
+        cfg->numOutputs = buffer[14];
+        break;
+      default:
+        ET_LOG(
+            Error,
+            "Unknown payload version %d. Please update the backend",
+            payloadVersion);
+        return Error::InvalidProgram;
+    }
     if (cfg->numInputs != numInputs) {
       ET_LOG(
           Error,
@@ -336,10 +363,16 @@ class NeutronBackend final : public PyTorchBackendInterface {
     // Allocate place for input and output pointers.
     cfg->dcfg.inputs = static_cast<const void**>(
         context.allocate(cfg->numInputs * sizeof(void*)));
-    cfg->dcfg.outputs =
-        static_cast<void**>(context.allocate(cfg->numOutputs * sizeof(void*)));
+    // There are 3 extra entries: scratch, profile and debug. The scratch
+    // pointer was allocated implicitly in the previous versions.
+    cfg->dcfg.outputs = static_cast<void**>(
+        context.allocate((cfg->numOutputs + 3) * sizeof(void*)));
     cfg->dcfg.outputs[cfg->numOutputs] =
         static_cast<void*>(context.allocate(cfg->scratchSize, 16));
+    cfg->dcfg.outputs[cfg->numOutputs + 1] =
+        static_cast<void*>(context.allocate(cfg->profileSize, 16));
+    cfg->dcfg.outputs[cfg->numOutputs + 2] =
+        static_cast<void*>(context.allocate(cfg->debugSize, 16));
 
     // Set inputs from args.
     // Transpose inputs if needed.
@@ -352,7 +385,7 @@ class NeutronBackend final : public PyTorchBackendInterface {
           return Error::InvalidProgram;
         }
         // Allocate buffer, the allocator is reset after each PTE instruction.
-        void* buffer = context.allocate(arg.nbytes());
+        void* buffer = context.allocate(arg.nbytes(), 16);
         transposeInput(
             arg.const_data_ptr(), buffer, arg.sizes(), arg.element_size());
         cfg->dcfg.inputs[i] = buffer;
@@ -368,7 +401,7 @@ class NeutronBackend final : public PyTorchBackendInterface {
       if (cfg->outputTranspositionFlags[i] &&
           multipleChannelsPresent(arg.sizes())) {
         // Allocate buffer, the allocator is reset after each PTE instruction.
-        void* buffer = context.allocate(arg.nbytes());
+        void* buffer = context.allocate(arg.nbytes(), 16);
         cfg->dcfg.outputs[i] = buffer;
       } else {
         cfg->dcfg.outputs[i] = arg.mutable_data_ptr();
diff --git a/backends/nxp/tests/test_neutron_backend.py b/backends/nxp/tests/test_neutron_backend.py
@@ -36,7 +36,10 @@ def test_neutron_backend__single_conv_model__payload_header_channels_last():
     assert payload[4] == 0x1  # Channels last 0-th Neutron output
     assert payload[5] == 0x0  # Map 0-th Neutron input to 0-th model input
     assert payload[6] == 0x0  # Map 0-th Neutron output to 0-th model output
-    assert all(byte == 0x0 for byte in payload[7:16])  # Aligned to 16 bytes
+    assert (
+        payload[7] == 0x0 or payload[7] == 0x1
+    )  # Payload version is 0 or 1 depending on the Neutron Software
+    assert all(byte == 0x0 for byte in payload[8:16])  # Aligned to 16 bytes
     assert payload[17] != 0x0  # Followed by non-zero content
 
 
@@ -53,5 +56,8 @@ def test_neutron_backend__linear_softmax_model__payload_header_formatless():
     assert payload[4] == 0x0  # Formatless 0-th Neutron output
     assert payload[5] == 0x0  # Map 0-th Neutron input to 0-th model input
     assert payload[6] == 0x0  # Map 0-th Neutron output to 0-th model output
-    assert all(byte == 0x0 for byte in payload[7:16])  # Aligned to 16 bytes
+    assert (
+        payload[7] == 0x0 or payload[7] == 0x1
+    )  # Payload version is 0 or 1 depending on the Neutron Software
+    assert all(byte == 0x0 for byte in payload[8:16])  # Aligned to 16 bytes
     assert payload[17] != 0x0  # Followed by non-zero content
diff --git a/backends/nxp/tests/test_neutron_backend_executor.py b/backends/nxp/tests/test_neutron_backend_executor.py
@@ -156,7 +156,7 @@ def test_delegating_format_related_transpose_operators__unsupported_shapes(mocke
 
     # Get the header of the payload for the delegated partition.
     payload_header = payload_header_spy.spy_return
-    assert payload_header.size == 7
+    assert payload_header.size == 8
     # the 4th and 5th bytes indicate the format. `1` means `channels_last`, which means the runtime will transpose the data.
     assert all(payload_header[3:5] == [1, 1])  # [<input_byte>, <output_byte>]
 
@@ -214,7 +214,7 @@ def test_delegating_format_related_transpose_operators__supported_case(mocker):
 
     # Get the header of the payload for the delegated partition.
     payload_header = payload_header_spy.spy_return
-    assert payload_header.size == 7
+    assert payload_header.size == 8
     # the 4th and 5th bytes indicate the format. `0` means `channels_last`, which means the runtime will NOT transpose the data.
     assert all(payload_header[3:5] == [0, 0])  # [<input_byte>, <output_byte>]
 
@@ -270,7 +270,7 @@ def test_delegating_format_related_transpose_operators__supported_output__unsupp
 
     # Get the header of the payload for the delegated partition.
     payload_header = payload_header_spy.spy_return
-    assert payload_header.size == 7
+    assert payload_header.size == 8
     # the 4th and 5th bytes indicate the format. `1` means `channels_last`, which means the runtime will transpose the data.
     assert all(payload_header[3:5] == [1, 0])  # [<input_byte>, <output_byte>]
 
@@ -322,6 +322,6 @@ def test_delegating_format_related_transpose_operators__supported_input__unsuppo
 
     # Get the header of the payload for the delegated partition.
     payload_header = payload_header_spy.spy_return
-    assert payload_header.size == 7
+    assert payload_header.size == 8
     # the 4th and 5th bytes indicate the format. `1` means `channels_last`, which means the runtime will transpose the data.
     assert all(payload_header[3:5] == [0, 1])  # [<input_byte>, <output_byte>]