From 158c5d8f109479ecfb9ca6ef5e638a4961f5b379 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com>
Date: Fri, 22 May 2026 17:39:32 -0700
Subject: [PATCH 01/91] Convert Android LLM extension from Java to Kotlin
(#19211)
Differential Revision: D102880053
Pull Request resolved: https://github.com/pytorch/executorch/pull/19211
---
extension/android/BUCK | 11 +-
.../android/executorch_android/build.gradle | 1 +
.../llm/{LlmCallback.java => LlmCallback.kt} | 27 +-
.../extension/llm/LlmGenerationConfig.java | 198 ----
.../extension/llm/LlmGenerationConfig.kt | 78 ++
.../executorch/extension/llm/LlmModule.java | 823 ----------------
.../executorch/extension/llm/LlmModule.kt | 898 ++++++++++++++++++
.../extension/llm/LlmModuleConfig.java | 252 -----
.../extension/llm/LlmModuleConfig.kt | 134 +++
.../extension/llm/package-info.java | 51 -
10 files changed, 1129 insertions(+), 1344 deletions(-)
rename extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/{LlmCallback.java => LlmCallback.kt} (53%)
delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java
create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt
delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt
delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java
create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt
delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/package-info.java
diff --git a/extension/android/BUCK b/extension/android/BUCK
index c7e275805e2..110b428575d 100644
--- a/extension/android/BUCK
+++ b/extension/android/BUCK
@@ -47,13 +47,14 @@ non_fbcode_target(_kind = fb_android_library,
name = "executorch_llama",
warnings_as_errors = False,
srcs = [
- "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java",
- "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java",
- "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java",
- "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java",
+ "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.kt",
+ "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt",
+ "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt",
+ "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt",
],
autoglob = False,
- language = "JAVA",
+ language = "KOTLIN",
+ extra_kotlinc_arguments = ["-Xjvm-default=all"],
deps = [
":executorch",
"//fbandroid/java/com/facebook/jni:jni",
diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle
index 3ee5b5877b3..2dbe0e1fb5f 100644
--- a/extension/android/executorch_android/build.gradle
+++ b/extension/android/executorch_android/build.gradle
@@ -51,6 +51,7 @@ android {
}
kotlinOptions {
jvmTarget = "11"
+ freeCompilerArgs += ["-Xjvm-default=all"]
}
}
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.kt
similarity index 53%
rename from extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java
rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.kt
index 4e834d06721..3b56986bf14 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.kt
@@ -6,45 +6,42 @@
* LICENSE file in the root directory of this source tree.
*/
-package org.pytorch.executorch.extension.llm;
+package org.pytorch.executorch.extension.llm
-import com.facebook.jni.annotations.DoNotStrip;
-import org.pytorch.executorch.annotations.Experimental;
+import com.facebook.jni.annotations.DoNotStrip
+import org.pytorch.executorch.annotations.Experimental
/**
- * Callback interface for Llama model. Users can implement this interface to receive the generated
+ * Callback interface for Llm model. Users can implement this interface to receive the generated
* tokens and statistics.
*
- *
Warning: These APIs are experimental and subject to change without notice
+ * Warning: These APIs are experimental and subject to change without notice
*/
@Experimental
-public interface LlmCallback {
+interface LlmCallback {
/**
* Called when a new result is available from JNI. Users will keep getting onResult() invocations
* until generate() finishes.
*
* @param result Last generated token
*/
- @DoNotStrip
- public void onResult(String result);
+ @DoNotStrip fun onResult(result: String)
/**
* Called when the statistics for the generate() is available.
*
- *
The result will be a JSON string. See extension/llm/stats.h for the field definitions.
+ * The result will be a JSON string. See extension/llm/stats.h for the field definitions.
*
* @param stats JSON string containing the statistics for the generate()
*/
- @DoNotStrip
- default void onStats(String stats) {}
+ @DoNotStrip fun onStats(stats: String) {}
/**
* Called when an error occurs during generate().
*
- * @param errorCode Error code from the ExecuTorch runtime (see {@link
- * org.pytorch.executorch.ExecutorchRuntimeException})
+ * @param errorCode Error code from the ExecuTorch runtime (see
+ * [org.pytorch.executorch.ExecutorchRuntimeException])
* @param message Human-readable error description
*/
- @DoNotStrip
- default void onError(int errorCode, String message) {}
+ @DoNotStrip fun onError(errorCode: Int, message: String) {}
}
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java
deleted file mode 100644
index db7941aadad..00000000000
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-package org.pytorch.executorch.extension.llm;
-
-/**
- * Configuration class for controlling text generation parameters in LLM operations.
- *
- *
This class provides settings for text generation behavior including output formatting,
- * generation limits, and sampling parameters. Instances should be created using the {@link
- * #create()} method and the fluent builder pattern.
- */
-public class LlmGenerationConfig {
- private final boolean echo;
- private final int maxNewTokens;
- private final boolean warming;
- private final int seqLen;
- private final float temperature;
- private final int numBos;
- private final int numEos;
-
- private LlmGenerationConfig(Builder builder) {
- this.echo = builder.echo;
- this.maxNewTokens = builder.maxNewTokens;
- this.warming = builder.warming;
- this.seqLen = builder.seqLen;
- this.temperature = builder.temperature;
- this.numBos = builder.numBos;
- this.numEos = builder.numEos;
- }
-
- /**
- * Creates a new Builder instance for constructing generation configurations.
- *
- * @return a new Builder with default configuration values
- */
- public static Builder create() {
- return new Builder();
- }
-
- /**
- * @return true if input prompt should be included in the output
- */
- public boolean isEcho() {
- return echo;
- }
-
- /**
- * @return maximum number of tokens to generate (-1 for unlimited)
- */
- public int getMaxNewTokens() {
- return maxNewTokens;
- }
-
- /**
- * @return true if model warming is enabled
- */
- public boolean isWarming() {
- return warming;
- }
-
- /**
- * @return maximum sequence length for generation (-1 for default)
- */
- public int getSeqLen() {
- return seqLen;
- }
-
- /**
- * @return temperature value for sampling (higher = more random)
- */
- public float getTemperature() {
- return temperature;
- }
-
- /**
- * @return number of BOS tokens to prepend
- */
- public int getNumBos() {
- return numBos;
- }
-
- /**
- * @return number of EOS tokens to append
- */
- public int getNumEos() {
- return numEos;
- }
-
- /**
- * Builder class for constructing LlmGenerationConfig instances.
- *
- *
Provides a fluent interface for configuring generation parameters with sensible defaults.
- * All methods return the builder instance to enable method chaining.
- */
- public static class Builder {
- private boolean echo = true;
- private int maxNewTokens = -1;
- private boolean warming = false;
- private int seqLen = -1;
- private float temperature = 0.8f;
- private int numBos = 0;
- private int numEos = 0;
-
- Builder() {}
-
- /**
- * Sets whether to include the input prompt in the generated output.
- *
- * @param echo true to include input prompt, false to return only new tokens
- * @return this builder instance
- */
- public Builder echo(boolean echo) {
- this.echo = echo;
- return this;
- }
-
- /**
- * Sets the maximum number of new tokens to generate.
- *
- * @param maxNewTokens the token limit (-1 for unlimited generation)
- * @return this builder instance
- */
- public Builder maxNewTokens(int maxNewTokens) {
- this.maxNewTokens = maxNewTokens;
- return this;
- }
-
- /**
- * Enables or disables model warming.
- *
- * @param warming true to generate initial tokens for model warmup
- * @return this builder instance
- */
- public Builder warming(boolean warming) {
- this.warming = warming;
- return this;
- }
-
- /**
- * Sets the maximum sequence length for generation.
- *
- * @param seqLen maximum sequence length (-1 for default behavior)
- * @return this builder instance
- */
- public Builder seqLen(int seqLen) {
- this.seqLen = seqLen;
- return this;
- }
-
- /**
- * Sets the temperature for random sampling.
- *
- * @param temperature sampling temperature (typical range 0.0-1.0)
- * @return this builder instance
- */
- public Builder temperature(float temperature) {
- this.temperature = temperature;
- return this;
- }
-
- /**
- * Sets the number of BOS tokens to prepend.
- *
- * @param numBos number of BOS tokens
- * @return this builder instance
- */
- public Builder numBos(int numBos) {
- this.numBos = numBos;
- return this;
- }
-
- /**
- * Sets the number of EOS tokens to append.
- *
- * @param numEos number of EOS tokens
- * @return this builder instance
- */
- public Builder numEos(int numEos) {
- this.numEos = numEos;
- return this;
- }
-
- /**
- * Constructs the LlmGenerationConfig instance with the configured parameters.
- *
- * @return new LlmGenerationConfig instance with current builder settings
- */
- public LlmGenerationConfig build() {
- return new LlmGenerationConfig(this);
- }
- }
-}
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt
new file mode 100644
index 00000000000..c0f8956fb7f
--- /dev/null
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.executorch.extension.llm
+
+/**
+ * Configuration class for controlling text generation parameters in LLM operations.
+ *
+ * This class provides settings for text generation behavior including output formatting, generation
+ * limits, and sampling parameters. Instances should be created using the [create] method and the
+ * fluent builder pattern.
+ */
+class LlmGenerationConfig
+private constructor(
+ @get:JvmName("isEcho") val echo: Boolean,
+ val maxNewTokens: Int,
+ @get:JvmName("isWarming") val warming: Boolean,
+ val seqLen: Int,
+ val temperature: Float,
+ val numBos: Int,
+ val numEos: Int,
+) {
+
+ companion object {
+ /**
+ * Creates a new Builder instance for constructing generation configurations.
+ *
+ * @return a new Builder with default configuration values
+ */
+ @JvmStatic fun create(): Builder = Builder()
+ }
+
+ /**
+ * Builder class for constructing LlmGenerationConfig instances.
+ *
+ * Provides a fluent interface for configuring generation parameters with sensible defaults. All
+ * methods return the builder instance to enable method chaining.
+ */
+ class Builder internal constructor() {
+ private var echo: Boolean = true
+ private var maxNewTokens: Int = -1
+ private var warming: Boolean = false
+ private var seqLen: Int = -1
+ private var temperature: Float = 0.8f
+ private var numBos: Int = 0
+ private var numEos: Int = 0
+
+ /** Sets whether to include the input prompt in the generated output. */
+ fun echo(echo: Boolean): Builder = apply { this.echo = echo }
+
+ /** Sets the maximum number of new tokens to generate. */
+ fun maxNewTokens(maxNewTokens: Int): Builder = apply { this.maxNewTokens = maxNewTokens }
+
+ /** Enables or disables model warming. */
+ fun warming(warming: Boolean): Builder = apply { this.warming = warming }
+
+ /** Sets the maximum sequence length for generation. */
+ fun seqLen(seqLen: Int): Builder = apply { this.seqLen = seqLen }
+
+ /** Sets the temperature for random sampling. */
+ fun temperature(temperature: Float): Builder = apply { this.temperature = temperature }
+
+ /** Sets the number of BOS tokens to prepend. */
+ fun numBos(numBos: Int): Builder = apply { this.numBos = numBos }
+
+ /** Sets the number of EOS tokens to append. */
+ fun numEos(numEos: Int): Builder = apply { this.numEos = numEos }
+
+ /** Constructs the LlmGenerationConfig instance with the configured parameters. */
+ fun build(): LlmGenerationConfig =
+ LlmGenerationConfig(echo, maxNewTokens, warming, seqLen, temperature, numBos, numEos)
+ }
+}
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
deleted file mode 100644
index 0c467b13f44..00000000000
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ /dev/null
@@ -1,823 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-package org.pytorch.executorch.extension.llm;
-
-import com.facebook.jni.HybridData;
-import com.facebook.jni.annotations.DoNotStrip;
-import java.io.Closeable;
-import java.nio.ByteBuffer;
-import java.util.List;
-import java.util.concurrent.locks.ReentrantLock;
-import org.pytorch.executorch.ExecuTorchRuntime;
-import org.pytorch.executorch.ExecutorchRuntimeException;
-import org.pytorch.executorch.annotations.Experimental;
-
-/**
- * LlmModule is a wrapper around the Executorch LLM. It provides a simple interface to generate text
- * from the model.
- *
- *
Warning: These APIs are experimental and subject to change without notice
- */
-@Experimental
-public class LlmModule implements Closeable {
-
- public static final int MODEL_TYPE_TEXT = 1;
- public static final int MODEL_TYPE_TEXT_VISION = 2;
- public static final int MODEL_TYPE_MULTIMODAL = 2;
-
- private final HybridData mHybridData;
- private final ReentrantLock mLock = new ReentrantLock();
- private volatile boolean mDestroyed = false;
- private static final int DEFAULT_SEQ_LEN = 128;
- private static final boolean DEFAULT_ECHO = true;
- private static final float DEFAULT_TEMPERATURE = -1.0f;
- private static final int DEFAULT_BOS = 0;
- private static final int DEFAULT_EOS = 0;
- private static final int DEFAULT_LOAD_MODE = LlmModuleConfig.LOAD_MODE_MMAP;
-
- @DoNotStrip
- private static native HybridData initHybrid(
- int modelType,
- String modulePath,
- String tokenizerPath,
- float temperature,
- List dataFiles,
- int numBos,
- int numEos,
- int loadMode);
-
- private LlmModule(
- int modelType,
- String modulePath,
- String tokenizerPath,
- float temperature,
- List dataFiles,
- int numBos,
- int numEos,
- int loadMode) {
- ExecuTorchRuntime.getRuntime();
- ExecuTorchRuntime.validateFilePath(modulePath, "model path");
- ExecuTorchRuntime.validateFilePath(tokenizerPath, "tokenizer path");
-
- mHybridData =
- initHybrid(
- modelType, modulePath, tokenizerPath, temperature, dataFiles, numBos, numEos, loadMode);
- }
-
- /**
- * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
- * dataFiles.
- */
- public LlmModule(
- int modelType,
- String modulePath,
- String tokenizerPath,
- float temperature,
- List dataFiles,
- int numBos,
- int numEos) {
- this(
- modelType,
- modulePath,
- tokenizerPath,
- temperature,
- dataFiles,
- numBos,
- numEos,
- DEFAULT_LOAD_MODE);
- }
-
- /**
- * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
- * dataFiles.
- */
- public LlmModule(
- int modelType,
- String modulePath,
- String tokenizerPath,
- float temperature,
- List dataFiles) {
- this(
- modelType,
- modulePath,
- tokenizerPath,
- temperature,
- dataFiles,
- DEFAULT_BOS,
- DEFAULT_EOS,
- DEFAULT_LOAD_MODE);
- }
-
- /**
- * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
- * data path.
- */
- public LlmModule(
- int modelType,
- String modulePath,
- String tokenizerPath,
- float temperature,
- String dataPath,
- int numBos,
- int numEos) {
- this(
- modelType,
- modulePath,
- tokenizerPath,
- temperature,
- dataPath != null ? List.of(dataPath) : List.of(),
- numBos,
- numEos);
- }
-
- /**
- * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
- * data path.
- */
- public LlmModule(
- int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath) {
- this(modelType, modulePath, tokenizerPath, temperature, dataPath, DEFAULT_BOS, DEFAULT_EOS);
- }
-
- /** Constructs a LLM Module for a model with given model path, tokenizer, temperature. */
- public LlmModule(String modulePath, String tokenizerPath, float temperature) {
- this(
- MODEL_TYPE_TEXT,
- modulePath,
- tokenizerPath,
- temperature,
- List.of(),
- DEFAULT_BOS,
- DEFAULT_EOS);
- }
-
- /**
- * Constructs a LLM Module for a model with given model path, tokenizer, temperature and data
- * path.
- */
- public LlmModule(String modulePath, String tokenizerPath, float temperature, String dataPath) {
- this(
- MODEL_TYPE_TEXT,
- modulePath,
- tokenizerPath,
- temperature,
- List.of(dataPath),
- DEFAULT_BOS,
- DEFAULT_EOS);
- }
-
- /** Constructs a LLM Module for a model with given path, tokenizer, and temperature. */
- public LlmModule(int modelType, String modulePath, String tokenizerPath, float temperature) {
- this(modelType, modulePath, tokenizerPath, temperature, List.of(), DEFAULT_BOS, DEFAULT_EOS);
- }
-
- /** Constructs a LLM Module for a model with the given LlmModuleConfig */
- public LlmModule(LlmModuleConfig config) {
- this(
- config.getModelType(),
- config.getModulePath(),
- config.getTokenizerPath(),
- config.getTemperature(),
- config.getDataPath() != null ? List.of(config.getDataPath()) : List.of(),
- config.getNumBos(),
- config.getNumEos(),
- config.getLoadMode());
- }
-
- private void checkNotDestroyed() {
- if (mDestroyed) throw new IllegalStateException("LlmModule has been destroyed");
- }
-
- private void checkNotReentrant() {
- if (mLock.getHoldCount() > 1) {
- throw new IllegalStateException("Cannot call LlmModule methods from within a callback");
- }
- }
-
- /**
- * Releases native resources. Callers must ensure no other methods are in-flight. Call {@link
- * #stop()} and wait for {@link #generate(String, LlmCallback)} to return before calling this
- * method.
- */
- @Override
- public void close() {
- if (mLock.tryLock()) {
- try {
- if (mLock.getHoldCount() > 1) {
- throw new IllegalStateException(
- "Cannot close module from within a callback during execution");
- }
- if (!mDestroyed) {
- mDestroyed = true;
- mHybridData.resetNative();
- }
- } finally {
- mLock.unlock();
- }
- } else {
- throw new IllegalStateException("Cannot close module while method is executing");
- }
- }
-
- /**
- * @deprecated Use {@link #close()} instead.
- */
- @Deprecated
- public void resetNative() {
- close();
- }
-
- /**
- * Start generating tokens from the module.
- *
- * @param prompt Input prompt
- * @param llmCallback callback object to receive results.
- */
- public void generate(String prompt, LlmCallback llmCallback) {
- generate(
- prompt,
- DEFAULT_SEQ_LEN,
- llmCallback,
- DEFAULT_ECHO,
- DEFAULT_TEMPERATURE,
- DEFAULT_BOS,
- DEFAULT_EOS);
- }
-
- /**
- * Start generating tokens from the module.
- *
- * @param prompt Input prompt
- * @param seqLen sequence length
- * @param llmCallback callback object to receive results.
- */
- public void generate(String prompt, int seqLen, LlmCallback llmCallback) {
- generate(
- null,
- 0,
- 0,
- 0,
- prompt,
- seqLen,
- llmCallback,
- DEFAULT_ECHO,
- DEFAULT_TEMPERATURE,
- DEFAULT_BOS,
- DEFAULT_EOS);
- }
-
- /**
- * Start generating tokens from the module.
- *
- * @param prompt Input prompt
- * @param llmCallback callback object to receive results
- * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
- */
- public void generate(String prompt, LlmCallback llmCallback, boolean echo) {
- generate(
- null,
- 0,
- 0,
- 0,
- prompt,
- DEFAULT_SEQ_LEN,
- llmCallback,
- echo,
- DEFAULT_TEMPERATURE,
- DEFAULT_BOS,
- DEFAULT_EOS);
- }
-
- /**
- * Start generating tokens from the module.
- *
- * @param prompt Input prompt
- * @param seqLen sequence length
- * @param llmCallback callback object to receive results
- * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
- */
- public void generate(String prompt, int seqLen, LlmCallback llmCallback, boolean echo) {
- generate(prompt, seqLen, llmCallback, echo, DEFAULT_TEMPERATURE, DEFAULT_BOS, DEFAULT_EOS);
- }
-
- /**
- * Start generating tokens from the module.
- *
- * @param prompt Input prompt
- * @param seqLen sequence length
- * @param llmCallback callback object to receive results
- * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
- * @param temperature temperature for sampling (use negative value to use module default)
- * @param numBos number of BOS tokens to prepend
- * @param numEos number of EOS tokens to append
- */
- public void generate(
- String prompt,
- int seqLen,
- LlmCallback llmCallback,
- boolean echo,
- float temperature,
- int numBos,
- int numEos) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- int err = generateNative(prompt, seqLen, llmCallback, echo, temperature, numBos, numEos);
- if (err != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to generate");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- @DoNotStrip
- private native int generateNative(
- String prompt,
- int seqLen,
- LlmCallback llmCallback,
- boolean echo,
- float temperature,
- int numBos,
- int numEos);
-
- /**
- * Start generating tokens from the module.
- *
- * @param prompt Input prompt
- * @param config the config for generation
- * @param llmCallback callback object to receive results
- */
- public void generate(String prompt, LlmGenerationConfig config, LlmCallback llmCallback) {
- int seqLen = config.getSeqLen();
- boolean echo = config.isEcho();
- float temperature = config.getTemperature();
- int numBos = config.getNumBos();
- int numEos = config.getNumEos();
- generate(null, 0, 0, 0, prompt, seqLen, llmCallback, echo, temperature, numBos, numEos);
- }
-
- /**
- * Start generating tokens from the module.
- *
- * @param image Input image as a byte array
- * @param width Input image width
- * @param height Input image height
- * @param channels Input image number of channels
- * @param prompt Input prompt
- * @param seqLen sequence length
- * @param llmCallback callback object to receive results.
- * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
- */
- public void generate(
- int[] image,
- int width,
- int height,
- int channels,
- String prompt,
- int seqLen,
- LlmCallback llmCallback,
- boolean echo) {
- generate(
- image,
- width,
- height,
- channels,
- prompt,
- seqLen,
- llmCallback,
- echo,
- DEFAULT_TEMPERATURE,
- DEFAULT_BOS,
- DEFAULT_EOS);
- }
-
- /**
- * Start generating tokens from the module.
- *
- * @param image Input image as a byte array
- * @param width Input image width
- * @param height Input image height
- * @param channels Input image number of channels
- * @param prompt Input prompt
- * @param seqLen sequence length
- * @param llmCallback callback object to receive results.
- * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
- * @param temperature temperature for sampling (use negative value to use module default)
- */
- public void generate(
- int[] image,
- int width,
- int height,
- int channels,
- String prompt,
- int seqLen,
- LlmCallback llmCallback,
- boolean echo,
- float temperature) {
- generate(
- image,
- width,
- height,
- channels,
- prompt,
- seqLen,
- llmCallback,
- echo,
- temperature,
- DEFAULT_BOS,
- DEFAULT_EOS);
- }
-
- /**
- * Start generating tokens from the module.
- *
- * @param image Input image as a byte array
- * @param width Input image width
- * @param height Input image height
- * @param channels Input image number of channels
- * @param prompt Input prompt
- * @param seqLen sequence length
- * @param llmCallback callback object to receive results.
- * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
- * @param temperature temperature for sampling (use negative value to use module default)
- * @param numBos number of BOS tokens to prepend
- * @param numEos number of EOS tokens to append
- */
- public void generate(
- int[] image,
- int width,
- int height,
- int channels,
- String prompt,
- int seqLen,
- LlmCallback llmCallback,
- boolean echo,
- float temperature,
- int numBos,
- int numEos) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- if (image != null) {
- int nativeResult = prefillImagesInput(image, width, height, channels);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- }
- int err = generateNative(prompt, seqLen, llmCallback, echo, temperature, numBos, numEos);
- if (err != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to generate");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- /**
- * Prefill the KV cache with the given image input.
- *
- * @param image Input image as a byte array
- * @param width Input image width
- * @param height Input image height
- * @param channels Input image number of channels
- * @throws ExecutorchRuntimeException if the prefill failed
- */
- @Experimental
- public void prefillImages(int[] image, int width, int height, int channels) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- int nativeResult = prefillImagesInput(image, width, height, channels);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- /**
- * Prefill a multimodal Module with the given image input via a direct ByteBuffer. The buffer data
- * is accessed directly without JNI array copies, unlike {@link #prefillImages(int[], int, int,
- * int)}. The ByteBuffer must contain raw uint8 pixel data in CHW format with at least channels *
- * height * width bytes remaining. Only the first channels * height * width bytes from the
- * buffer's current position are read; the position of the original ByteBuffer is not modified.
- *
- * @param image Input image as a direct ByteBuffer containing uint8 pixel data
- * @param width Input image width
- * @param height Input image height
- * @param channels Input image number of channels
- * @throws IllegalArgumentException if the ByteBuffer is not direct or has insufficient remaining
- * bytes
- * @throws ExecutorchRuntimeException if the prefill failed
- */
- @Experimental
- public void prefillImages(ByteBuffer image, int width, int height, int channels) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- if (!image.isDirect()) {
- throw new IllegalArgumentException("Input ByteBuffer must be direct.");
- }
- long expectedBytes;
- try {
- long pixels = Math.multiplyExact((long) width, (long) height);
- expectedBytes = Math.multiplyExact(pixels, (long) channels);
- } catch (ArithmeticException ex) {
- throw new IllegalArgumentException(
- "width*height*channels is too large and overflows the allowed range.", ex);
- }
- if (width <= 0
- || height <= 0
- || channels <= 0
- || expectedBytes > Integer.MAX_VALUE
- || image.remaining() < expectedBytes) {
- throw new IllegalArgumentException(
- "ByteBuffer remaining ("
- + image.remaining()
- + ") must be at least width*height*channels ("
- + expectedBytes
- + ").");
- }
- // slice() so that getDirectBufferAddress on the native side returns a pointer
- // starting at the current position, not the base address.
- int nativeResult = prefillImagesInputBuffer(image.slice(), width, height, channels);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- /**
- * Prefill a multimodal Module with the given normalized image input via a direct ByteBuffer. The
- * buffer data is accessed directly without JNI array copies, unlike {@link
- * #prefillImages(float[], int, int, int)}. The ByteBuffer must contain normalized float pixel
- * data in CHW format with at least channels * height * width * 4 bytes remaining. Only the first
- * channels * height * width floats from the buffer's current position are consumed. The buffer
- * must use the platform's native byte order (set via {@code
- * buffer.order(ByteOrder.nativeOrder())}).
- *
- * @param image Input normalized image as a direct ByteBuffer containing float pixel data in
- * native byte order
- * @param width Input image width
- * @param height Input image height
- * @param channels Input image number of channels
- * @throws IllegalArgumentException if the ByteBuffer is not direct, has insufficient remaining
- * bytes, is not float-aligned, or does not use native byte order
- * @throws ExecutorchRuntimeException if the prefill failed
- */
- @Experimental
- public void prefillNormalizedImage(ByteBuffer image, int width, int height, int channels) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- if (!image.isDirect()) {
- throw new IllegalArgumentException("Input ByteBuffer must be direct.");
- }
- if (image.order() != java.nio.ByteOrder.nativeOrder()) {
- throw new IllegalArgumentException(
- "Input ByteBuffer must use native byte order (ByteOrder.nativeOrder()).");
- }
- if (image.position() % Float.BYTES != 0) {
- throw new IllegalArgumentException(
- "Input ByteBuffer position (" + image.position() + ") must be 4-byte aligned.");
- }
- final long expectedBytes;
- try {
- int wh = Math.multiplyExact(width, height);
- long whc = Math.multiplyExact((long) wh, (long) channels);
- long totalBytes = Math.multiplyExact(whc, (long) Float.BYTES);
- if (totalBytes > Integer.MAX_VALUE) {
- throw new IllegalArgumentException(
- "ByteBuffer size (width*height*channels*4) exceeds Integer.MAX_VALUE bytes: "
- + totalBytes);
- }
- expectedBytes = totalBytes;
- } catch (ArithmeticException e) {
- throw new IllegalArgumentException(
- "Overflow while computing width*height*channels*4 for ByteBuffer size.", e);
- }
- if (width <= 0 || height <= 0 || channels <= 0 || image.remaining() < expectedBytes) {
- throw new IllegalArgumentException(
- "ByteBuffer remaining ("
- + image.remaining()
- + ") must be at least width*height*channels*4 ("
- + expectedBytes
- + ").");
- }
- if (image.remaining() % Float.BYTES != 0) {
- throw new IllegalArgumentException(
- "ByteBuffer remaining ("
- + image.remaining()
- + ") must be a multiple of 4 (float size).");
- }
- // slice() so that getDirectBufferAddress on the native side returns a pointer
- // starting at the current position, not the base address.
- int nativeResult = prefillNormalizedImagesInputBuffer(image.slice(), width, height, channels);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- private native int prefillImagesInput(int[] image, int width, int height, int channels);
-
- private native int prefillImagesInputBuffer(
- ByteBuffer image, int width, int height, int channels);
-
- private native int prefillNormalizedImagesInputBuffer(
- ByteBuffer image, int width, int height, int channels);
-
- /**
- * Prefill the KV cache with the given normalized image input.
- *
- * @param image Input normalized image as a float array
- * @param width Input image width
- * @param height Input image height
- * @param channels Input image number of channels
- * @throws ExecutorchRuntimeException if the prefill failed
- */
- @Experimental
- public void prefillImages(float[] image, int width, int height, int channels) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- int nativeResult = prefillNormalizedImagesInput(image, width, height, channels);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- private native int prefillNormalizedImagesInput(
- float[] image, int width, int height, int channels);
-
- /**
- * Prefill the KV cache with the given preprocessed audio input.
- *
- * @param audio Input preprocessed audio as a byte array
- * @param batch_size Input batch size
- * @param n_bins Input number of bins
- * @param n_frames Input number of frames
- * @throws ExecutorchRuntimeException if the prefill failed
- */
- @Experimental
- public void prefillAudio(byte[] audio, int batch_size, int n_bins, int n_frames) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- int nativeResult = prefillAudioInput(audio, batch_size, n_bins, n_frames);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- private native int prefillAudioInput(byte[] audio, int batch_size, int n_bins, int n_frames);
-
- /**
- * Prefill the KV cache with the given preprocessed audio input.
- *
- * @param audio Input preprocessed audio as a float array
- * @param batch_size Input batch size
- * @param n_bins Input number of bins
- * @param n_frames Input number of frames
- * @throws ExecutorchRuntimeException if the prefill failed
- */
- @Experimental
- public void prefillAudio(float[] audio, int batch_size, int n_bins, int n_frames) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- int nativeResult = prefillAudioInputFloat(audio, batch_size, n_bins, n_frames);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- private native int prefillAudioInputFloat(
- float[] audio, int batch_size, int n_bins, int n_frames);
-
- /**
- * Prefill the KV cache with the given raw audio input.
- *
- * @param audio Input raw audio as a byte array
- * @param batch_size Input batch size
- * @param n_channels Input number of channels
- * @param n_samples Input number of samples
- * @throws ExecutorchRuntimeException if the prefill failed
- */
- @Experimental
- public void prefillRawAudio(byte[] audio, int batch_size, int n_channels, int n_samples) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- int nativeResult = prefillRawAudioInput(audio, batch_size, n_channels, n_samples);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- private native int prefillRawAudioInput(
- byte[] audio, int batch_size, int n_channels, int n_samples);
-
- /**
- * Prefill the KV cache with the given text prompt.
- *
- * @param prompt The text prompt to prefill.
- * @throws ExecutorchRuntimeException if the prefill failed
- */
- @Experimental
- public void prefillPrompt(String prompt) {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- int nativeResult = prefillTextInput(prompt);
- if (nativeResult != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- // returns status
- private native int prefillTextInput(String prompt);
-
- /**
- * Reset the context of the LLM. This will clear the KV cache and reset the state of the LLM.
- *
- * The startPos will be reset to 0.
- */
- public void resetContext() {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- resetContextNative();
- } finally {
- mLock.unlock();
- }
- }
-
- @DoNotStrip
- private native void resetContextNative();
-
- /** Stop current generate() before it finishes. */
- public void stop() {
- if (mDestroyed) return;
- stopNative();
- }
-
- @DoNotStrip
- private native void stopNative();
-
- /** Force loading the module. Otherwise the model is loaded during first generate(). */
- public void load() {
- mLock.lock();
- try {
- checkNotReentrant();
- checkNotDestroyed();
- int err = loadNative();
- if (err != 0) {
- throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to load model");
- }
- } finally {
- mLock.unlock();
- }
- }
-
- @DoNotStrip
- private native int loadNative();
-}
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt
new file mode 100644
index 00000000000..f95e796b83b
--- /dev/null
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.executorch.extension.llm
+
+import com.facebook.jni.HybridData
+import com.facebook.jni.annotations.DoNotStrip
+import java.io.Closeable
+import java.nio.ByteBuffer
+import java.nio.ByteOrder
+import java.util.concurrent.locks.ReentrantLock
+import org.pytorch.executorch.ExecuTorchRuntime
+import org.pytorch.executorch.ExecutorchRuntimeException
+import org.pytorch.executorch.annotations.Experimental
+
+/**
+ * LlmModule is a wrapper around the Executorch LLM. It provides a simple interface to generate text
+ * from the model.
+ *
+ * Warning: These APIs are experimental and subject to change without notice
+ */
+@Experimental
+class LlmModule
+private constructor(
+ modelType: Int,
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ dataFiles: List,
+ numBos: Int,
+ numEos: Int,
+ loadMode: Int,
+) : Closeable {
+
+ private val mHybridData: HybridData
+ private val mLock = ReentrantLock()
+ @Volatile private var mDestroyed = false
+
+ init {
+ ExecuTorchRuntime.getRuntime()
+ ExecuTorchRuntime.validateFilePath(modulePath, "model path")
+ ExecuTorchRuntime.validateFilePath(tokenizerPath, "tokenizer path")
+ mHybridData =
+ initHybrid(
+ modelType,
+ modulePath,
+ tokenizerPath,
+ temperature,
+ dataFiles,
+ numBos,
+ numEos,
+ loadMode,
+ )
+ }
+
+ /**
+ * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+ * dataFiles.
+ */
+ constructor(
+ modelType: Int,
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ dataFiles: List,
+ numBos: Int,
+ numEos: Int,
+ ) : this(
+ modelType,
+ modulePath,
+ tokenizerPath,
+ temperature,
+ dataFiles,
+ numBos,
+ numEos,
+ DEFAULT_LOAD_MODE,
+ )
+
+ /**
+ * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+ * dataFiles.
+ */
+ constructor(
+ modelType: Int,
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ dataFiles: List,
+ ) : this(
+ modelType,
+ modulePath,
+ tokenizerPath,
+ temperature,
+ dataFiles,
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ DEFAULT_LOAD_MODE,
+ )
+
+ /**
+ * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+ * data path.
+ */
+ constructor(
+ modelType: Int,
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ dataPath: String?,
+ numBos: Int,
+ numEos: Int,
+ ) : this(
+ modelType,
+ modulePath,
+ tokenizerPath,
+ temperature,
+ listOfNotNull(dataPath),
+ numBos,
+ numEos,
+ )
+
+ /**
+ * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+ * data path.
+ */
+ constructor(
+ modelType: Int,
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ dataPath: String?,
+ ) : this(
+ modelType,
+ modulePath,
+ tokenizerPath,
+ temperature,
+ dataPath,
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+
+ /** Constructs a LLM Module for a model with given model path, tokenizer, temperature. */
+ constructor(
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ ) : this(
+ MODEL_TYPE_TEXT,
+ modulePath,
+ tokenizerPath,
+ temperature,
+ emptyList(),
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+
+ /**
+ * Constructs a LLM Module for a model with given model path, tokenizer, temperature and data
+ * path.
+ */
+ constructor(
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ dataPath: String,
+ ) : this(
+ MODEL_TYPE_TEXT,
+ modulePath,
+ tokenizerPath,
+ temperature,
+ listOf(dataPath),
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+
+ /** Constructs a LLM Module for a model with given path, tokenizer, and temperature. */
+ constructor(
+ modelType: Int,
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ ) : this(
+ modelType,
+ modulePath,
+ tokenizerPath,
+ temperature,
+ emptyList(),
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+
+ /** Constructs a LLM Module for a model with the given LlmModuleConfig */
+ constructor(
+ config: LlmModuleConfig
+ ) : this(
+ config.modelType,
+ config.modulePath,
+ config.tokenizerPath,
+ config.temperature,
+ listOfNotNull(config.dataPath),
+ config.numBos,
+ config.numEos,
+ config.loadMode,
+ )
+
+ private fun checkNotDestroyed() {
+ if (mDestroyed) throw IllegalStateException("LlmModule has been destroyed")
+ }
+
+ private fun checkNotReentrant() {
+ if (mLock.holdCount > 1) {
+ throw IllegalStateException("Cannot call LlmModule methods from within a callback")
+ }
+ }
+
+ /**
+ * Releases native resources. Callers must ensure no other methods are in-flight. Call [stop] and
+ * wait for [generate] to return before calling this method.
+ */
+ override fun close() {
+ if (mLock.tryLock()) {
+ try {
+ if (mLock.holdCount > 1) {
+ throw IllegalStateException("Cannot close module from within a callback during execution")
+ }
+ if (!mDestroyed) {
+ mDestroyed = true
+ mHybridData.resetNative()
+ }
+ } finally {
+ mLock.unlock()
+ }
+ } else {
+ throw IllegalStateException("Cannot close module while method is executing")
+ }
+ }
+
+ /** @deprecated Use [close] instead. */
+ @Deprecated("Use close() instead", replaceWith = ReplaceWith("close()"))
+ fun resetNative() {
+ close()
+ }
+
+ // --- generate overloads ---
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param prompt Input prompt
+ * @param llmCallback callback object to receive results.
+ */
+ fun generate(prompt: String, llmCallback: LlmCallback) {
+ generate(
+ prompt,
+ DEFAULT_SEQ_LEN,
+ llmCallback,
+ DEFAULT_ECHO,
+ DEFAULT_TEMPERATURE,
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+ }
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param prompt Input prompt
+ * @param seqLen sequence length
+ * @param llmCallback callback object to receive results.
+ */
+ fun generate(prompt: String, seqLen: Int, llmCallback: LlmCallback) {
+ generate(
+ null,
+ 0,
+ 0,
+ 0,
+ prompt,
+ seqLen,
+ llmCallback,
+ DEFAULT_ECHO,
+ DEFAULT_TEMPERATURE,
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+ }
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param prompt Input prompt
+ * @param llmCallback callback object to receive results
+ * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+ */
+ fun generate(prompt: String, llmCallback: LlmCallback, echo: Boolean) {
+ generate(
+ null,
+ 0,
+ 0,
+ 0,
+ prompt,
+ DEFAULT_SEQ_LEN,
+ llmCallback,
+ echo,
+ DEFAULT_TEMPERATURE,
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+ }
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param prompt Input prompt
+ * @param seqLen sequence length
+ * @param llmCallback callback object to receive results
+ * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+ */
+ fun generate(prompt: String, seqLen: Int, llmCallback: LlmCallback, echo: Boolean) {
+ generate(prompt, seqLen, llmCallback, echo, DEFAULT_TEMPERATURE, DEFAULT_BOS, DEFAULT_EOS)
+ }
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param prompt Input prompt
+ * @param seqLen sequence length
+ * @param llmCallback callback object to receive results
+ * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+ * @param temperature temperature for sampling (use negative value to use module default)
+ * @param numBos number of BOS tokens to prepend
+ * @param numEos number of EOS tokens to append
+ */
+ fun generate(
+ prompt: String,
+ seqLen: Int,
+ llmCallback: LlmCallback,
+ echo: Boolean,
+ temperature: Float,
+ numBos: Int,
+ numEos: Int,
+ ) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ val err = generateNative(prompt, seqLen, llmCallback, echo, temperature, numBos, numEos)
+ if (err != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to generate")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ @DoNotStrip
+ private external fun generateNative(
+ prompt: String,
+ seqLen: Int,
+ llmCallback: LlmCallback,
+ echo: Boolean,
+ temperature: Float,
+ numBos: Int,
+ numEos: Int,
+ ): Int
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param prompt Input prompt
+ * @param config the config for generation
+ * @param llmCallback callback object to receive results
+ */
+ fun generate(prompt: String, config: LlmGenerationConfig, llmCallback: LlmCallback) {
+ generate(
+ null,
+ 0,
+ 0,
+ 0,
+ prompt,
+ config.seqLen,
+ llmCallback,
+ config.echo,
+ config.temperature,
+ config.numBos,
+ config.numEos,
+ )
+ }
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param image Input image as a byte array
+ * @param width Input image width
+ * @param height Input image height
+ * @param channels Input image number of channels
+ * @param prompt Input prompt
+ * @param seqLen sequence length
+ * @param llmCallback callback object to receive results.
+ * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+ */
+ fun generate(
+ image: IntArray?,
+ width: Int,
+ height: Int,
+ channels: Int,
+ prompt: String,
+ seqLen: Int,
+ llmCallback: LlmCallback,
+ echo: Boolean,
+ ) {
+ generate(
+ image,
+ width,
+ height,
+ channels,
+ prompt,
+ seqLen,
+ llmCallback,
+ echo,
+ DEFAULT_TEMPERATURE,
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+ }
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param image Input image as a byte array
+ * @param width Input image width
+ * @param height Input image height
+ * @param channels Input image number of channels
+ * @param prompt Input prompt
+ * @param seqLen sequence length
+ * @param llmCallback callback object to receive results.
+ * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+ * @param temperature temperature for sampling (use negative value to use module default)
+ */
+ fun generate(
+ image: IntArray?,
+ width: Int,
+ height: Int,
+ channels: Int,
+ prompt: String,
+ seqLen: Int,
+ llmCallback: LlmCallback,
+ echo: Boolean,
+ temperature: Float,
+ ) {
+ generate(
+ image,
+ width,
+ height,
+ channels,
+ prompt,
+ seqLen,
+ llmCallback,
+ echo,
+ temperature,
+ DEFAULT_BOS,
+ DEFAULT_EOS,
+ )
+ }
+
+ /**
+ * Start generating tokens from the module.
+ *
+ * @param image Input image as a byte array
+ * @param width Input image width
+ * @param height Input image height
+ * @param channels Input image number of channels
+ * @param prompt Input prompt
+ * @param seqLen sequence length
+ * @param llmCallback callback object to receive results.
+ * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+ * @param temperature temperature for sampling (use negative value to use module default)
+ * @param numBos number of BOS tokens to prepend
+ * @param numEos number of EOS tokens to append
+ */
+ fun generate(
+ image: IntArray?,
+ width: Int,
+ height: Int,
+ channels: Int,
+ prompt: String,
+ seqLen: Int,
+ llmCallback: LlmCallback,
+ echo: Boolean,
+ temperature: Float,
+ numBos: Int,
+ numEos: Int,
+ ) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ if (image != null) {
+ val nativeResult = prefillImagesInput(image, width, height, channels)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ }
+ val err = generateNative(prompt, seqLen, llmCallback, echo, temperature, numBos, numEos)
+ if (err != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to generate")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ // --- prefill methods ---
+
+ /**
+ * Prefill the KV cache with the given image input.
+ *
+ * @param image Input image as a byte array
+ * @param width Input image width
+ * @param height Input image height
+ * @param channels Input image number of channels
+ * @throws ExecutorchRuntimeException if the prefill failed
+ */
+ @Experimental
+ fun prefillImages(image: IntArray, width: Int, height: Int, channels: Int) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ val nativeResult = prefillImagesInput(image, width, height, channels)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ /**
+ * Prefill a multimodal Module with the given image input via a direct ByteBuffer. The buffer data
+ * is accessed directly without JNI array copies, unlike [prefillImages]. The ByteBuffer must
+ * contain raw uint8 pixel data in CHW format with at least channels * height * width bytes
+ * remaining. Only the first channels * height * width bytes from the buffer's current position
+ * are read; the position of the original ByteBuffer is not modified.
+ *
+ * @param image Input image as a direct ByteBuffer containing uint8 pixel data
+ * @param width Input image width
+ * @param height Input image height
+ * @param channels Input image number of channels
+ * @throws IllegalArgumentException if the ByteBuffer is not direct or has insufficient remaining
+ * bytes
+ * @throws ExecutorchRuntimeException if the prefill failed
+ */
+ @Experimental
+ fun prefillImages(image: ByteBuffer, width: Int, height: Int, channels: Int) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ require(image.isDirect) { "Input ByteBuffer must be direct." }
+ val expectedBytes: Long
+ try {
+ val pixels = Math.multiplyExact(width.toLong(), height.toLong())
+ expectedBytes = Math.multiplyExact(pixels, channels.toLong())
+ } catch (ex: ArithmeticException) {
+ throw IllegalArgumentException(
+ "width*height*channels is too large and overflows the allowed range.",
+ ex,
+ )
+ }
+ require(
+ width > 0 &&
+ height > 0 &&
+ channels > 0 &&
+ expectedBytes <= Int.MAX_VALUE.toLong() &&
+ image.remaining().toLong() >= expectedBytes
+ ) {
+ "ByteBuffer remaining (${image.remaining()}) must be at least width*height*channels ($expectedBytes)."
+ }
+ // slice() so that getDirectBufferAddress on the native side returns a pointer
+ // starting at the current position, not the base address.
+ val nativeResult = prefillImagesInputBuffer(image.slice(), width, height, channels)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ /**
+ * Prefill a multimodal Module with the given normalized image input via a direct ByteBuffer. The
+ * buffer data is accessed directly without JNI array copies, unlike [prefillImages]. The
+ * ByteBuffer must contain normalized float pixel data in CHW format with at least channels *
+ * height * width * 4 bytes remaining. Only the first channels * height * width floats from the
+ * buffer's current position are consumed. The buffer must use the platform's native byte order
+ * (set via `buffer.order(ByteOrder.nativeOrder())`).
+ *
+ * @param image Input normalized image as a direct ByteBuffer containing float pixel data in
+ * native byte order
+ * @param width Input image width
+ * @param height Input image height
+ * @param channels Input image number of channels
+ * @throws IllegalArgumentException if the ByteBuffer is not direct, has insufficient remaining
+ * bytes, is not float-aligned, or does not use native byte order
+ * @throws ExecutorchRuntimeException if the prefill failed
+ */
+ @Experimental
+ fun prefillNormalizedImage(image: ByteBuffer, width: Int, height: Int, channels: Int) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ require(image.isDirect) { "Input ByteBuffer must be direct." }
+ require(image.order() == ByteOrder.nativeOrder()) {
+ "Input ByteBuffer must use native byte order (ByteOrder.nativeOrder())."
+ }
+ require(image.position() % Float.SIZE_BYTES == 0) {
+ "Input ByteBuffer position (${image.position()}) must be 4-byte aligned."
+ }
+ val expectedBytes: Long
+ try {
+ val wh = Math.multiplyExact(width, height)
+ val whc = Math.multiplyExact(wh.toLong(), channels.toLong())
+ val totalBytes = Math.multiplyExact(whc, Float.SIZE_BYTES.toLong())
+ if (totalBytes > Int.MAX_VALUE.toLong()) {
+ throw IllegalArgumentException(
+ "ByteBuffer size (width*height*channels*4) exceeds Integer.MAX_VALUE bytes: $totalBytes",
+ )
+ }
+ expectedBytes = totalBytes
+ } catch (e: ArithmeticException) {
+ throw IllegalArgumentException(
+ "Overflow while computing width*height*channels*4 for ByteBuffer size.",
+ e,
+ )
+ }
+ require(
+ width > 0 && height > 0 && channels > 0 && image.remaining().toLong() >= expectedBytes
+ ) {
+ "ByteBuffer remaining (${image.remaining()}) must be at least width*height*channels*4 ($expectedBytes)."
+ }
+ require(image.remaining() % Float.SIZE_BYTES == 0) {
+ "ByteBuffer remaining (${image.remaining()}) must be a multiple of 4 (float size)."
+ }
+ // slice() so that getDirectBufferAddress on the native side returns a pointer
+ // starting at the current position, not the base address.
+ val nativeResult = prefillNormalizedImagesInputBuffer(image.slice(), width, height, channels)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ private external fun prefillImagesInput(
+ image: IntArray,
+ width: Int,
+ height: Int,
+ channels: Int,
+ ): Int
+
+ private external fun prefillImagesInputBuffer(
+ image: ByteBuffer,
+ width: Int,
+ height: Int,
+ channels: Int,
+ ): Int
+
+ private external fun prefillNormalizedImagesInputBuffer(
+ image: ByteBuffer,
+ width: Int,
+ height: Int,
+ channels: Int,
+ ): Int
+
+ /**
+ * Prefill the KV cache with the given normalized image input.
+ *
+ * @param image Input normalized image as a float array
+ * @param width Input image width
+ * @param height Input image height
+ * @param channels Input image number of channels
+ * @throws ExecutorchRuntimeException if the prefill failed
+ */
+ @Experimental
+ fun prefillImages(image: FloatArray, width: Int, height: Int, channels: Int) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ val nativeResult = prefillNormalizedImagesInput(image, width, height, channels)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ private external fun prefillNormalizedImagesInput(
+ image: FloatArray,
+ width: Int,
+ height: Int,
+ channels: Int,
+ ): Int
+
+ /**
+ * Prefill the KV cache with the given preprocessed audio input.
+ *
+ * @param audio Input preprocessed audio as a byte array
+ * @param batchSize Input batch size
+ * @param nBins Input number of bins
+ * @param nFrames Input number of frames
+ * @throws ExecutorchRuntimeException if the prefill failed
+ */
+ @Experimental
+ fun prefillAudio(audio: ByteArray, batchSize: Int, nBins: Int, nFrames: Int) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ val nativeResult = prefillAudioInput(audio, batchSize, nBins, nFrames)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ private external fun prefillAudioInput(
+ audio: ByteArray,
+ batchSize: Int,
+ nBins: Int,
+ nFrames: Int,
+ ): Int
+
+ /**
+ * Prefill the KV cache with the given preprocessed audio input.
+ *
+ * @param audio Input preprocessed audio as a float array
+ * @param batchSize Input batch size
+ * @param nBins Input number of bins
+ * @param nFrames Input number of frames
+ * @throws ExecutorchRuntimeException if the prefill failed
+ */
+ @Experimental
+ fun prefillAudio(audio: FloatArray, batchSize: Int, nBins: Int, nFrames: Int) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ val nativeResult = prefillAudioInputFloat(audio, batchSize, nBins, nFrames)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ private external fun prefillAudioInputFloat(
+ audio: FloatArray,
+ batchSize: Int,
+ nBins: Int,
+ nFrames: Int,
+ ): Int
+
+ /**
+ * Prefill the KV cache with the given raw audio input.
+ *
+ * @param audio Input raw audio as a byte array
+ * @param batchSize Input batch size
+ * @param nChannels Input number of channels
+ * @param nSamples Input number of samples
+ * @throws ExecutorchRuntimeException if the prefill failed
+ */
+ @Experimental
+ fun prefillRawAudio(audio: ByteArray, batchSize: Int, nChannels: Int, nSamples: Int) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ val nativeResult = prefillRawAudioInput(audio, batchSize, nChannels, nSamples)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ private external fun prefillRawAudioInput(
+ audio: ByteArray,
+ batchSize: Int,
+ nChannels: Int,
+ nSamples: Int,
+ ): Int
+
+ /**
+ * Prefill the KV cache with the given text prompt.
+ *
+ * @param prompt The text prompt to prefill.
+ * @throws ExecutorchRuntimeException if the prefill failed
+ */
+ @Experimental
+ fun prefillPrompt(prompt: String) {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ val nativeResult = prefillTextInput(prompt)
+ if (nativeResult != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ // returns status
+ private external fun prefillTextInput(prompt: String): Int
+
+ /**
+ * Reset the context of the LLM. This will clear the KV cache and reset the state of the LLM.
+ *
+ * The startPos will be reset to 0.
+ */
+ fun resetContext() {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ resetContextNative()
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ @DoNotStrip private external fun resetContextNative()
+
+ /** Stop current generate() before it finishes. */
+ fun stop() {
+ if (mDestroyed) return
+ stopNative()
+ }
+
+ @DoNotStrip private external fun stopNative()
+
+ /** Force loading the module. Otherwise the model is loaded during first generate(). */
+ fun load() {
+ mLock.lock()
+ try {
+ checkNotReentrant()
+ checkNotDestroyed()
+ val err = loadNative()
+ if (err != 0) {
+ throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to load model")
+ }
+ } finally {
+ mLock.unlock()
+ }
+ }
+
+ @DoNotStrip private external fun loadNative(): Int
+
+ companion object {
+ const val MODEL_TYPE_TEXT = 1
+ const val MODEL_TYPE_TEXT_VISION = 2
+ const val MODEL_TYPE_MULTIMODAL = 2
+
+ private const val DEFAULT_SEQ_LEN = 128
+ private const val DEFAULT_ECHO = true
+ private const val DEFAULT_TEMPERATURE = -1.0f
+ private const val DEFAULT_BOS = 0
+ private const val DEFAULT_EOS = 0
+ private const val DEFAULT_LOAD_MODE = LlmModuleConfig.LOAD_MODE_MMAP
+
+ @DoNotStrip
+ @JvmStatic
+ private external fun initHybrid(
+ modelType: Int,
+ modulePath: String,
+ tokenizerPath: String,
+ temperature: Float,
+ dataFiles: List,
+ numBos: Int,
+ numEos: Int,
+ loadMode: Int,
+ ): HybridData
+ }
+}
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java
deleted file mode 100644
index feb52a2b34b..00000000000
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-package org.pytorch.executorch.extension.llm;
-
-/**
- * Configuration class for initializing a LlmModule.
- *
- * {@link #create()} method and the fluent builder pattern.
- */
-public class LlmModuleConfig {
- private final String modulePath;
- private final String tokenizerPath;
- private final float temperature;
- private final String dataPath;
- private final int modelType;
- private final int numBos;
- private final int numEos;
- private final int loadMode;
-
- /** Load entire model file into a buffer (no mmap). */
- public static final int LOAD_MODE_FILE = 0;
-
- /** Load model via mmap without mlock (default). Pages faulted in on demand. */
- public static final int LOAD_MODE_MMAP = 1;
-
- /** Load model via mmap and pin all pages with mlock. */
- public static final int LOAD_MODE_MMAP_USE_MLOCK = 2;
-
- /** Load model via mmap and attempt mlock, ignoring mlock failures. */
- public static final int LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS = 3;
-
- private LlmModuleConfig(Builder builder) {
- this.modulePath = builder.modulePath;
- this.tokenizerPath = builder.tokenizerPath;
- this.temperature = builder.temperature;
- this.dataPath = builder.dataPath;
- this.modelType = builder.modelType;
- this.numBos = builder.numBos;
- this.numEos = builder.numEos;
- this.loadMode = builder.loadMode;
- }
-
- /** Model type constant for text-only models. */
- public static final int MODEL_TYPE_TEXT = 1;
-
- /** Model type constant for text-and-vision multimodal models. */
- public static final int MODEL_TYPE_TEXT_VISION = 2;
-
- /** Model type constant for generic multimodal models. */
- public static final int MODEL_TYPE_MULTIMODAL = 2;
-
- /**
- * Creates a new Builder instance for constructing LlmModuleConfig objects.
- *
- * @return a new Builder instance with default configuration values
- */
- public static Builder create() {
- return new Builder();
- }
-
- // Getters with documentation
- /**
- * @return Path to the compiled model module (.pte file)
- */
- public String getModulePath() {
- return modulePath;
- }
-
- /**
- * @return Path to the tokenizer file or directory
- */
- public String getTokenizerPath() {
- return tokenizerPath;
- }
-
- /**
- * @return Temperature value for sampling (higher = more random)
- */
- public float getTemperature() {
- return temperature;
- }
-
- /**
- * @return Optional path to additional data files
- */
- public String getDataPath() {
- return dataPath;
- }
-
- /**
- * @return Type of model (text-only or text-vision)
- */
- public int getModelType() {
- return modelType;
- }
-
- /**
- * @return Number of BOS tokens to prepend
- */
- public int getNumBos() {
- return numBos;
- }
-
- /**
- * @return Number of EOS tokens to append
- */
- public int getNumEos() {
- return numEos;
- }
-
- /**
- * @return Load mode for the model file (one of LOAD_MODE_* constants)
- */
- public int getLoadMode() {
- return loadMode;
- }
-
- /**
- * Builder class for constructing LlmModuleConfig instances with optional parameters.
- *
- *
The builder provides a fluent interface for configuring model parameters and validates
- * required fields before construction.
- */
- public static class Builder {
- private String modulePath;
- private String tokenizerPath;
- private float temperature = 0.8f;
- private String dataPath = "";
- private int modelType = MODEL_TYPE_TEXT;
- private int numBos = 0;
- private int numEos = 0;
- private int loadMode = LOAD_MODE_MMAP;
-
- Builder() {}
-
- /**
- * Sets the path to the module.
- *
- * @param modulePath Path to module
- * @return This builder instance for method chaining
- */
- public Builder modulePath(String modulePath) {
- this.modulePath = modulePath;
- return this;
- }
-
- /**
- * Sets the path to the tokenizer.
- *
- * @param tokenizerPath Path to tokenizer
- * @return This builder instance for method chaining
- */
- public Builder tokenizerPath(String tokenizerPath) {
- this.tokenizerPath = tokenizerPath;
- return this;
- }
-
- /**
- * Sets the temperature for sampling generation.
- *
- * @param temperature Temperature value (typical range 0.0-1.0)
- * @return This builder instance for method chaining
- */
- public Builder temperature(float temperature) {
- this.temperature = temperature;
- return this;
- }
-
- /**
- * Sets the path to optional additional data files.
- *
- * @param dataPath Path to supplementary data resources
- * @return This builder instance for method chaining
- */
- public Builder dataPath(String dataPath) {
- this.dataPath = dataPath;
- return this;
- }
-
- /**
- * Sets the model type (text-only or multimodal).
- *
- * @param modelType One of MODEL_TYPE_TEXT, MODEL_TYPE_TEXT_VISION, MODEL_TYPE_MULTIMODAL
- * @return This builder instance for method chaining
- */
- public Builder modelType(int modelType) {
- this.modelType = modelType;
- return this;
- }
-
- /**
- * Sets the number of BOS tokens to prepend.
- *
- * @param numBos number of BOS tokens
- * @return This builder instance for method chaining
- */
- public Builder numBos(int numBos) {
- this.numBos = numBos;
- return this;
- }
-
- /**
- * Sets the number of EOS tokens to append.
- *
- * @param numEos number of EOS tokens
- * @return This builder instance for method chaining
- */
- public Builder numEos(int numEos) {
- this.numEos = numEos;
- return this;
- }
-
- /**
- * Sets the load mode for the model file. Defaults to {@link #LOAD_MODE_MMAP} (mmap without
- * mlock), which avoids pinning model pages in RAM.
- *
- * @param loadMode One of LOAD_MODE_FILE, LOAD_MODE_MMAP, LOAD_MODE_MMAP_USE_MLOCK,
- * LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS
- * @return This builder instance for method chaining
- * @throws IllegalArgumentException if {@code loadMode} is not one of the supported constants
- */
- public Builder loadMode(int loadMode) {
- if (loadMode != LOAD_MODE_FILE
- && loadMode != LOAD_MODE_MMAP
- && loadMode != LOAD_MODE_MMAP_USE_MLOCK
- && loadMode != LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS) {
- throw new IllegalArgumentException("Unknown load mode: " + loadMode);
- }
- this.loadMode = loadMode;
- return this;
- }
-
- /**
- * Constructs the LlmModuleConfig instance with validated parameters.
- *
- * @return New LlmModuleConfig instance with configured values
- * @throws IllegalArgumentException if required fields are missing
- */
- public LlmModuleConfig build() {
- if (modulePath == null || tokenizerPath == null) {
- throw new IllegalArgumentException("Module path and tokenizer path are required");
- }
- return new LlmModuleConfig(this);
- }
- }
-}
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt
new file mode 100644
index 00000000000..2d65633bb9f
--- /dev/null
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.executorch.extension.llm
+
+/**
+ * Configuration class for initializing a LlmModule.
+ *
+ * Use [create] method and the fluent builder pattern.
+ */
+class LlmModuleConfig
+private constructor(
+ val modulePath: String,
+ val tokenizerPath: String,
+ val temperature: Float,
+ val dataPath: String?,
+ val modelType: Int,
+ val numBos: Int,
+ val numEos: Int,
+ val loadMode: Int,
+) {
+
+ companion object {
+ /** Load entire model file into a buffer (no mmap). */
+ const val LOAD_MODE_FILE = 0
+
+ /** Load model via mmap without mlock (default). Pages faulted in on demand. */
+ const val LOAD_MODE_MMAP = 1
+
+ /** Load model via mmap and pin all pages with mlock. */
+ const val LOAD_MODE_MMAP_USE_MLOCK = 2
+
+ /** Load model via mmap and attempt mlock, ignoring mlock failures. */
+ const val LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS = 3
+
+ /** Model type constant for text-only models. */
+ const val MODEL_TYPE_TEXT = 1
+
+ /** Model type constant for text-and-vision multimodal models. */
+ const val MODEL_TYPE_TEXT_VISION = 2
+
+ /** Model type constant for generic multimodal models. */
+ const val MODEL_TYPE_MULTIMODAL = 2
+
+ /**
+ * Creates a new Builder instance for constructing LlmModuleConfig objects.
+ *
+ * @return a new Builder instance with default configuration values
+ */
+ @JvmStatic fun create(): Builder = Builder()
+ }
+
+ /**
+ * Builder class for constructing LlmModuleConfig instances with optional parameters.
+ *
+ * The builder provides a fluent interface for configuring model parameters and validates required
+ * fields before construction.
+ */
+ class Builder internal constructor() {
+ private var modulePath: String? = null
+ private var tokenizerPath: String? = null
+ private var temperature: Float = 0.8f
+ private var dataPath: String? = ""
+ private var modelType: Int = MODEL_TYPE_TEXT
+ private var numBos: Int = 0
+ private var numEos: Int = 0
+ private var loadMode: Int = LOAD_MODE_MMAP
+
+ /** Sets the path to the module. */
+ fun modulePath(modulePath: String): Builder = apply { this.modulePath = modulePath }
+
+ /** Sets the path to the tokenizer. */
+ fun tokenizerPath(tokenizerPath: String): Builder = apply { this.tokenizerPath = tokenizerPath }
+
+ /** Sets the temperature for sampling generation. */
+ fun temperature(temperature: Float): Builder = apply { this.temperature = temperature }
+
+ /** Sets the path to optional additional data files. */
+ fun dataPath(dataPath: String?): Builder = apply { this.dataPath = dataPath }
+
+ /** Sets the model type (text-only or multimodal). */
+ fun modelType(modelType: Int): Builder = apply { this.modelType = modelType }
+
+ /** Sets the number of BOS tokens to prepend. */
+ fun numBos(numBos: Int): Builder = apply { this.numBos = numBos }
+
+ /** Sets the number of EOS tokens to append. */
+ fun numEos(numEos: Int): Builder = apply { this.numEos = numEos }
+
+ /**
+ * Sets the load mode for the model file. Defaults to [LOAD_MODE_MMAP] (mmap without mlock),
+ * which avoids pinning model pages in RAM.
+ *
+ * @throws IllegalArgumentException if loadMode is not one of the supported constants
+ */
+ fun loadMode(loadMode: Int): Builder {
+ require(
+ loadMode == LOAD_MODE_FILE ||
+ loadMode == LOAD_MODE_MMAP ||
+ loadMode == LOAD_MODE_MMAP_USE_MLOCK ||
+ loadMode == LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS
+ ) {
+ "Unknown load mode: $loadMode"
+ }
+ return apply { this.loadMode = loadMode }
+ }
+
+ /**
+ * Constructs the LlmModuleConfig instance with validated parameters.
+ *
+ * @throws IllegalArgumentException if required fields are missing
+ */
+ fun build(): LlmModuleConfig {
+ require(modulePath != null && tokenizerPath != null) {
+ "Module path and tokenizer path are required"
+ }
+ return LlmModuleConfig(
+ modulePath!!,
+ tokenizerPath!!,
+ temperature,
+ dataPath,
+ modelType,
+ numBos,
+ numEos,
+ loadMode,
+ )
+ }
+ }
+}
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/package-info.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/package-info.java
deleted file mode 100644
index 86e19d09133..00000000000
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/package-info.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * ExecuTorch LLM extension for Android.
- *
- *
This package provides Java bindings for running large language models (LLMs) on Android using
- * ExecuTorch. It supports text generation, tokenization, and streaming token callbacks.
- *
- *
Quick Start
- *
- * {@code
- * import org.pytorch.executorch.extension.llm.LlmModule;
- *
- * // Load a Llama model
- * LlmModule llm = new LlmModule(
- * "/data/local/tmp/llama.pte",
- * "/data/local/tmp/tokenizer.bin",
- * 0.8f
- * );
- * llm.load();
- *
- * // Generate text token by token
- * llm.generate("Hello, my name is", 200, new LlmCallback() {
- * public void onResult(String token) {
- * System.out.print(token);
- * }
- * public void onStats(String stats) {
- * System.out.println("\nStats: " + stats);
- * }
- * });
- * }
- *
- * Key Classes
- *
- *
- * - {@link org.pytorch.executorch.extension.llm.LlmModule} — load and run an LLM
- *
- {@link org.pytorch.executorch.extension.llm.LlmModuleConfig} — configure model paths and
- * settings
- *
- {@link org.pytorch.executorch.extension.llm.LlmGenerationConfig} — control generation
- * (temperature, seq length)
- *
- *
- * More Resources
- *
- *
- */
-package org.pytorch.executorch.extension.llm;
From 6bda6c490ed8c2e2ac02049725b9a454dc92ec07 Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Fri, 22 May 2026 18:25:34 -0700
Subject: [PATCH 02/91] Globally serialize XNNPACK execution, add logging
(#19742)
Differential Revision: D106123930
Pull Request resolved: https://github.com/pytorch/executorch/pull/19742
---
backends/xnnpack/runtime/XNNPACKBackend.cpp | 53 ++++++++++++++++++++-
1 file changed, 51 insertions(+), 2 deletions(-)
diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp
index c20fa985f46..2fe1e4d162e 100644
--- a/backends/xnnpack/runtime/XNNPACKBackend.cpp
+++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp
@@ -16,6 +16,7 @@
#include
#include
+#include
#include
#include
@@ -41,6 +42,13 @@ using executorch::runtime::FreeableBuffer;
using executorch::runtime::Result;
using executorch::runtime::Span;
+// Global mutex for all XNNPACK operations. This is temporary, tracked by
+// T272407942.
+static std::mutex& global_xnnpack_mutex() {
+ static std::mutex m;
+ return m;
+}
+
class XnnpackBackend final
: public ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface {
public:
@@ -66,6 +74,8 @@ class XnnpackBackend final
BackendInitContext& context,
FreeableBuffer* processed,
ArrayRef compile_specs) const override {
+ const std::lock_guard global_lock(global_xnnpack_mutex());
+
auto executor = context.get_runtime_allocator()
->allocateInstance();
if (executor == nullptr) {
@@ -129,6 +139,17 @@ class XnnpackBackend final
Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned int)err);
return err;
}
+
+ ET_LOG(
+ Info,
+ "XnnpackBackend::init delegate=%p workspace_id=%" PRIu64
+ " workspace_ptr=%p program_id=0x%" PRIxPTR " weight_cache=%s",
+ (void*)executor,
+ workspace->id(),
+ (void*)workspace_ptr,
+ program_id,
+ use_weight_cache ? "true" : "false");
+
return executor;
}
@@ -136,15 +157,27 @@ class XnnpackBackend final
BackendExecutionContext& context,
DelegateHandle* handle,
Span args) const override {
+ const std::lock_guard global_lock(global_xnnpack_mutex());
+
auto executor = static_cast(handle);
+ auto workspace = executor->get_workspace();
+ ET_LOG(
+ Info,
+ "XnnpackBackend::execute begin delegate=%p workspace_id=%" PRIu64
+ " num_args=%zu weight_cache=%s",
+ (void*)executor,
+ workspace->id(),
+ (size_t)args.size(),
+ executor->uses_weight_cache() ? "true" : "false");
+
std::unique_lock lock_weights_cache(
weights_cache_mutex_, std::defer_lock);
if (executor->uses_weight_cache()) {
lock_weights_cache.lock();
}
- auto [raii_lock, _] = executor->get_workspace()->acquire();
+ auto [raii_lock, _] = workspace->acquire();
// Prepare Inputs/Outputs and Propagate Input Shapes
Error err = executor->prepare_args(args);
@@ -161,12 +194,29 @@ class XnnpackBackend final
// Convert output data types if necessary (e.g., int32 -> int64 for Long)
err = executor->convert_outputs(args);
+ ET_LOG(
+ Info,
+ "XnnpackBackend::execute end delegate=%p workspace_id=%" PRIu64
+ " err=0x%x",
+ (void*)executor,
+ workspace->id(),
+ (unsigned int)err);
+
return err;
}
void destroy(DelegateHandle* handle) const override {
if (handle != nullptr) {
+ const std::lock_guard global_lock(global_xnnpack_mutex());
+
auto executor = static_cast(handle);
+ auto workspace = executor->get_workspace();
+
+ ET_LOG(
+ Info,
+ "XnnpackBackend::destroy delegate=%p workspace_id=%" PRIu64,
+ (void*)executor,
+ workspace->id());
#ifdef ENABLE_XNNPACK_PROFILING
executor->print_avg_op_timings();
@@ -183,7 +233,6 @@ class XnnpackBackend final
// the same backend instance. Make sure to hold onto the workspace
// shared_ptr, as the pointer in the executor is freed, which includes
// the mutex referenced by raii_lock.
- auto workspace = executor->get_workspace();
auto [raii_lock, _] = workspace->acquire();
// XNNExecutor is not trivially destructible. Since this was constructed
From 12f62f2eb869eddbe4c612efe3f957bfc965aff0 Mon Sep 17 00:00:00 2001
From: Gasoonjia
Date: Fri, 22 May 2026 20:48:11 -0700
Subject: [PATCH 03/91] [ET Device Support] Module: allocate device memory for
planned buffers (#19746)
https://github.com/pytorch/executorch/pull/18476 clone version due to
bot crash
---
extension/module/module.cpp | 78 ++++++-
extension/module/module.h | 9 +
extension/module/targets.bzl | 1 +
.../module/test/module_device_memory_test.cpp | 218 ++++++++++++++++++
extension/module/test/targets.bzl | 22 +-
.../executorch/build/build_variables.bzl | 2 +
test/models/targets.bzl | 1 +
7 files changed, 328 insertions(+), 3 deletions(-)
create mode 100644 extension/module/test/module_device_memory_test.cpp
diff --git a/extension/module/module.cpp b/extension/module/module.cpp
index 5422fb15b71..11fea031603 100644
--- a/extension/module/module.cpp
+++ b/extension/module/module.cpp
@@ -13,6 +13,7 @@
#include
#include
#include
+#include
#include
namespace executorch {
@@ -367,6 +368,51 @@ Module::make_planned_memory_with_shared_arenas(
return planned;
}
+std::unique_ptr Module::make_planned_memory_with_devices(
+ const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) {
+ auto planned = std::make_unique();
+ const size_t num_buffers = method_meta.num_memory_planned_buffers();
+ planned->planned_buffers.reserve(num_buffers);
+ planned->planned_spans.reserve(num_buffers);
+ planned->device_buffers.reserve(num_buffers);
+ planned->planned_devices.reserve(num_buffers);
+
+ for (size_t i = 0; i < num_buffers; ++i) {
+ auto size = method_meta.memory_planned_buffer_size(i);
+ ET_CHECK_MSG(size.ok(), "Failed to get buffer size for index %zu", i);
+ auto device = method_meta.memory_planned_buffer_device(i);
+ ET_CHECK_MSG(device.ok(), "Failed to get buffer device for index %zu", i);
+ planned->planned_devices.push_back(device.get());
+
+ if (device->is_cpu()) {
+ planned->planned_buffers.emplace_back(size.get());
+ planned->planned_spans.emplace_back(
+ planned->planned_buffers.back().data(), size.get());
+ } else {
+ // Allocate device memory via DeviceAllocator and store the RAII buffer.
+ planned->planned_buffers.emplace_back(); // empty CPU placeholder
+ auto dmb = runtime::DeviceMemoryBuffer::create(
+ size.get(), device->type(), device->index());
+ ET_CHECK_MSG(
+ dmb.ok(),
+ "Failed to allocate device memory for buffer %zu (device_type=%d)",
+ i,
+ static_cast(device->type()));
+ planned->planned_spans.emplace_back(dmb->as_span());
+ planned->device_buffers.push_back(std::move(dmb.get()));
+ }
+ }
+
+ // HierarchicalAllocator owns the per-buffer Device metadata so the
+ // MemoryManager can later expose it via planned_buffer_devices().
+ planned->planned_memory = std::make_unique(
+ runtime::Span>(
+ planned->planned_spans.data(), planned->planned_spans.size()),
+ runtime::Span(
+ planned->planned_devices.data(), planned->planned_devices.size()));
+ return planned;
+}
+
runtime::Result> Module::get_mem_planned_buffer_sizes(
const std::string& method_name) {
auto meta_res = program_->method_meta(method_name.c_str());
@@ -422,10 +468,38 @@ runtime::Error Module::load_method(
MethodHolder method_holder;
if (!planned_memory) {
- if (!share_memory_arenas_) {
+ // Check if any buffers need device memory allocation.
+ auto meta_res = program_->method_meta(method_name.c_str());
+ ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error());
+ auto& meta = meta_res.get();
+
+ bool has_device_buffers = false;
+ for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
+ auto dev = meta.memory_planned_buffer_device(i);
+ if (dev.ok() && !dev->is_cpu()) {
+ has_device_buffers = true;
+ break;
+ }
+ }
+
+ if (has_device_buffers) {
+ // Device memory with shared arenas is not yet supported.
+ ET_CHECK_OR_RETURN_ERROR(
+ !share_memory_arenas_,
+ NotSupported,
+ "Device memory buffers are not yet compatible with "
+ "share_memory_arenas. Please disable share_memory_arenas "
+ "when using models with device-planned memory.");
+
+ // Device-aware path: allocate CPU and device buffers. The device
+ // span is owned by the HierarchicalAllocator inside PlannedMemory.
+ method_holder.planned_memory = make_planned_memory_with_devices(meta);
+ planned_memory = method_holder.planned_memory->planned_memory.get();
+ } else if (!share_memory_arenas_) {
auto sizes_res = get_mem_planned_buffer_sizes(method_name);
ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
method_holder.planned_memory = make_planned_memory(sizes_res.get());
+ planned_memory = method_holder.planned_memory->planned_memory.get();
} else {
auto sizes_res = get_mem_planned_buffer_sizes(method_name);
ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
@@ -442,8 +516,8 @@ runtime::Error Module::load_method(
}
method_holder.planned_memory =
make_planned_memory_with_shared_arenas(sizes, shared_arenas_);
+ planned_memory = method_holder.planned_memory->planned_memory.get();
}
- planned_memory = method_holder.planned_memory->planned_memory.get();
}
method_holder.memory_manager = std::make_unique(
diff --git a/extension/module/module.h b/extension/module/module.h
index 47ead23032e..91c7feaad9b 100644
--- a/extension/module/module.h
+++ b/extension/module/module.h
@@ -18,6 +18,8 @@
#include
#include
+#include
+
#ifdef USE_ATEN_LIB
#define ET_MODULE_NAMESPACE module::aten
#else // !USE_ATEN_LIB
@@ -716,6 +718,11 @@ class Module {
struct PlannedMemory {
std::vector> planned_buffers;
std::vector> planned_spans;
+ std::vector device_buffers;
+ /// Per-buffer Device (type + index) metadata used by
+ /// HierarchicalAllocator. Owns the storage backing the device span the
+ /// allocator references, so it must outlive `planned_memory`.
+ std::vector planned_devices;
std::unique_ptr planned_memory;
};
std::unique_ptr make_planned_memory(
@@ -723,6 +730,8 @@ class Module {
std::unique_ptr make_planned_memory_with_shared_arenas(
const std::vector& buffer_sizes,
std::vector>& shared_arenas);
+ std::unique_ptr make_planned_memory_with_devices(
+ const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta);
runtime::Result> get_mem_planned_buffer_sizes(
const std::string& method_name);
runtime::Result> get_max_mem_planned_buffer_sizes();
diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl
index fa80203831a..e622b138ff6 100644
--- a/extension/module/targets.bzl
+++ b/extension/module/targets.bzl
@@ -30,6 +30,7 @@ def define_common_targets():
"//executorch/runtime/backend:backend_options",
"//executorch/runtime/backend:backend_options_map",
"//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
+ "//executorch/runtime/core:device_memory_buffer",
],
)
diff --git a/extension/module/test/module_device_memory_test.cpp b/extension/module/test/module_device_memory_test.cpp
new file mode 100644
index 00000000000..5031273ac2b
--- /dev/null
+++ b/extension/module/test/module_device_memory_test.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Tests that Module's device-aware memory allocation path works correctly.
+ *
+ * Uses ModuleAddWithDevice.pte which has:
+ * non_const_buffer_sizes: [0, 48] (1 buffer, index 0 reserved)
+ * non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}]
+ *
+ * Since we don't have a real CUDA backend, we test that:
+ * 1. CPU-only models load through Module without invoking device allocator
+ * 2. Device-annotated models trigger DeviceMemoryBuffer::create via a mock
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+
+using executorch::extension::Module;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+namespace {
+
+class MockCudaAllocator : public DeviceAllocator {
+ public:
+ Result allocate(
+ size_t nbytes,
+ DeviceIndex index,
+ size_t alignment = kDefaultAlignment) override {
+ (void)alignment;
+ allocate_count_++;
+ last_allocate_size_ = nbytes;
+ last_allocate_index_ = index;
+ buffer_ = std::make_unique(nbytes);
+ return static_cast(buffer_.get());
+ }
+
+ void deallocate(void* ptr, DeviceIndex index) override {
+ deallocate_count_++;
+ buffer_.reset();
+ }
+
+ Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
+ return Error::Ok;
+ }
+
+ Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
+ return Error::Ok;
+ }
+
+ DeviceType device_type() const override {
+ return DeviceType::CUDA;
+ }
+
+ int allocate_count_ = 0;
+ int deallocate_count_ = 0;
+ size_t last_allocate_size_ = 0;
+ DeviceIndex last_allocate_index_ = -1;
+
+ private:
+ std::unique_ptr buffer_;
+};
+
+} // namespace
+
+static MockCudaAllocator g_mock_cuda;
+
+class ModuleDeviceMemoryTest : public ::testing::Test {
+ protected:
+ static void SetUpTestSuite() {
+ executorch::runtime::runtime_init();
+ register_device_allocator(&g_mock_cuda);
+ }
+
+ void SetUp() override {
+ g_mock_cuda.allocate_count_ = 0;
+ g_mock_cuda.deallocate_count_ = 0;
+ g_mock_cuda.last_allocate_size_ = 0;
+ g_mock_cuda.last_allocate_index_ = -1;
+ }
+};
+
+TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) {
+ const char* path = std::getenv("ET_MODULE_ADD_PATH");
+ ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_PATH not set";
+
+ Module module(path);
+ auto err = module.load_method("forward");
+ ASSERT_EQ(err, Error::Ok);
+
+ EXPECT_EQ(g_mock_cuda.allocate_count_, 0)
+ << "CPU-only model should not allocate device memory";
+}
+
+TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) {
+ // Directly test DeviceMemoryBuffer::create with the registered mock.
+ // This verifies the RAII allocation/deallocation path that Module uses.
+ {
+ auto result = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0);
+ ASSERT_TRUE(result.ok());
+ auto buf = std::move(result.get());
+
+ EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+ EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48);
+ EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0);
+ EXPECT_NE(buf.data(), nullptr);
+ EXPECT_EQ(buf.size(), 48);
+
+ // as_span() wraps the device pointer for HierarchicalAllocator.
+ auto span = buf.as_span();
+ EXPECT_EQ(span.data(), static_cast(buf.data()));
+ EXPECT_EQ(span.size(), 48);
+
+ EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+ }
+ // RAII deallocation on scope exit.
+ EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+}
+
+TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) {
+ // Verify MethodMeta reports the correct device for buffers in the
+ // device-annotated model, without needing to load the full method.
+ const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+ ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+ Module module(path);
+ auto err = module.load();
+ ASSERT_EQ(err, Error::Ok);
+
+ auto meta = module.method_meta("forward");
+ ASSERT_TRUE(meta.ok());
+
+ // ModuleAddWithDevice has 1 planned buffer (48 bytes) on CUDA.
+ ASSERT_EQ(meta->num_memory_planned_buffers(), 1);
+
+ auto size = meta->memory_planned_buffer_size(0);
+ ASSERT_TRUE(size.ok());
+ EXPECT_EQ(size.get(), 48);
+
+ auto device = meta->memory_planned_buffer_device(0);
+ ASSERT_TRUE(device.ok());
+ EXPECT_EQ(device->type(), DeviceType::CUDA);
+ EXPECT_EQ(device->index(), 0);
+}
+
+TEST_F(ModuleDeviceMemoryTest, DeviceModelWithSharedArenasReturnsNotSupported) {
+ const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+ ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+ // share_memory_arenas = true with a device-annotated model should fail.
+ Module module(
+ path,
+ Module::LoadMode::File,
+ /*event_tracer=*/nullptr,
+ /*memory_allocator=*/nullptr,
+ /*temp_allocator=*/nullptr,
+ /*share_memory_arenas=*/true);
+
+ auto err = module.load_method("forward");
+ EXPECT_EQ(err, Error::NotSupported);
+}
+
+TEST_F(
+ ModuleDeviceMemoryTest,
+ LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) {
+ const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+ ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+ {
+ Module module(path);
+ auto err = module.load_method("forward");
+
+ // Regardless of whether load_method succeeds or fails (e.g. due to
+ // backend init issues), the device-aware memory allocation path
+ // (make_planned_memory_with_devices) runs BEFORE backend init.
+ EXPECT_EQ(g_mock_cuda.allocate_count_, 1)
+ << "Expected 1 device allocation for the CUDA buffer"
+ << " (actual: " << g_mock_cuda.allocate_count_ << ")"
+ << ", deallocate_count=" << g_mock_cuda.deallocate_count_
+ << ", load_method returned error=" << static_cast(err);
+ EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48)
+ << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)";
+ EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0)
+ << "Expected device_index=0 (cuda:0)";
+
+ if (err == Error::Ok) {
+ // Success path: MethodHolder moved into methods_ map.
+ // DeviceMemoryBuffer is alive as long as Module is alive.
+ EXPECT_EQ(g_mock_cuda.deallocate_count_, 0)
+ << "No deallocation while method is loaded";
+ } else {
+ // Error path: local MethodHolder destroyed on return from load_method.
+ // RAII deallocation already happened.
+ EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
+ << "RAII deallocation on error path";
+ }
+ }
+
+ // After Module destroyed, all device memory must be freed.
+ EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
+ << "Expected deallocation after Module destroyed";
+}
diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl
index f0d7e449efd..4dc3fb537f3 100644
--- a/extension/module/test/targets.bzl
+++ b/extension/module/test/targets.bzl
@@ -28,7 +28,7 @@ def define_common_targets(is_fbcode=False):
aten_suffix = ("_aten" if aten_mode else "")
runtime.cxx_test(
- name = "test" + aten_suffix,
+ name = "module_test" + aten_suffix,
srcs = [
"module_test.cpp",
],
@@ -68,6 +68,26 @@ def define_common_targets(is_fbcode=False):
],
)
+ runtime.cxx_test(
+ name = "module_device_memory_test" + aten_suffix,
+ srcs = [
+ "module_device_memory_test.cpp",
+ ],
+ deps = [
+ "//executorch/kernels/portable:generated_lib" + aten_suffix,
+ "//executorch/extension/module:module" + aten_suffix,
+ "//executorch/runtime/core:device_allocator",
+ "//executorch/runtime/core:device_memory_buffer",
+ ],
+ env = {
+ "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
+ "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])",
+ },
+ compiler_flags = [
+ "-Wno-error=deprecated-declarations",
+ ],
+ )
+
runtime.filegroup(
name = "resources",
srcs = native.glob([
diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl
index b0545b8ce18..659a128994f 100644
--- a/shim_et/xplat/executorch/build/build_variables.bzl
+++ b/shim_et/xplat/executorch/build/build_variables.bzl
@@ -50,6 +50,8 @@ PLATFORM_SRCS = [
EXECUTORCH_CORE_SRCS = sorted([
"runtime/backend/interface.cpp",
+ "runtime/core/device_allocator.cpp",
+ "runtime/core/device_memory_buffer.cpp",
"runtime/core/evalue.cpp",
"runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp",
"runtime/core/exec_aten/util/tensor_util_portable.cpp",
diff --git a/test/models/targets.bzl b/test/models/targets.bzl
index c9fb67b7d31..a80244b1383 100644
--- a/test/models/targets.bzl
+++ b/test/models/targets.bzl
@@ -226,6 +226,7 @@ def define_common_targets():
default_outs = ["."],
visibility = [
"//executorch/runtime/executor/test/...",
+ "//executorch/extension/module/test/...",
],
)
From c27cc5d5bb872603ec90378c486049bc2c77a382 Mon Sep 17 00:00:00 2001
From: Gasoonjia
Date: Fri, 22 May 2026 20:54:37 -0700
Subject: [PATCH 04/91] [ET Device Support] CudaAllocator: device memory
allocator for CUDA backend (#19747)
clone https://github.com/pytorch/executorch/pull/18477 due to bot crash
---
backends/aoti/slim/core/storage.h | 44 ++--
backends/aoti/slim/core/targets.bzl | 1 +
backends/cuda/runtime/TARGETS | 29 +++
backends/cuda/runtime/cuda_allocator.cpp | 258 +++++++++++++++++++++++
backends/cuda/runtime/cuda_allocator.h | 84 ++++++++
backends/cuda/runtime/cuda_backend.cpp | 9 +
6 files changed, 395 insertions(+), 30 deletions(-)
create mode 100644 backends/cuda/runtime/cuda_allocator.cpp
create mode 100644 backends/cuda/runtime/cuda_allocator.h
diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h
index 73c4d32d955..a3d17a89903 100644
--- a/backends/aoti/slim/core/storage.h
+++ b/backends/aoti/slim/core/storage.h
@@ -13,6 +13,7 @@
#ifdef CUDA_AVAILABLE
#include
#include
+#include
#endif
#include
@@ -107,9 +108,6 @@ struct DeviceTraits {
/// @param device The target CUDA device (used to get the stream).
/// @return Pointer to allocated device memory.
static void* allocate(size_t nbytes, const c10::Device& device) {
- // Get the current stream for this device (set by CUDAStreamGuard if any)
- // This follows PyTorch's pattern where the allocator assumes the caller
- // has already set the correct device via CUDAStreamGuard.
auto stream_result =
executorch::backends::cuda::getCurrentCUDAStream(device.index());
ET_CHECK_MSG(
@@ -118,31 +116,23 @@ struct DeviceTraits {
static_cast(device.index()));
cudaStream_t stream = stream_result.get();
- void* data = nullptr;
- ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
- return data;
+ auto result = executorch::backends::cuda::CudaAllocator::allocate_async(
+ nbytes, device.index(), stream);
+ ET_CHECK_MSG(
+ result.ok(),
+ "CudaAllocator::allocate_async failed for %zu bytes on device %d",
+ nbytes,
+ static_cast(device.index()));
+ return result.get();
}
- /// Frees CUDA device memory on the current stream.
- /// @param ptr Pointer to device memory to free.
static void free(void* ptr) {
- // Get the current stream for the current device
- // Currently all cuda slimtensors should be on the same device same stream,
- // so we can just use the stream on current device.
- // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
- // support multiple devices.
auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
- ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+ executorch::backends::cuda::CudaAllocator::deallocate_async(
+ ptr, -1, stream_result.get());
}
- /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
- /// @param dst Destination pointer.
- /// @param src Source pointer.
- /// @param nbytes Number of bytes to copy.
- /// @param dst_device Destination device.
- /// @param src_device Source device.
- /// @param stream CUDA stream for async copy.
static void memcpy_async(
void* dst,
const void* src,
@@ -151,7 +141,6 @@ struct DeviceTraits {
const c10::Device& src_device,
cudaStream_t stream) {
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
-
if (src_device.is_cpu()) {
direction = cudaMemcpyHostToDevice;
} else if (dst_device.is_cpu()) {
@@ -164,15 +153,11 @@ struct DeviceTraits {
static_cast(dst_device.index()));
}
- ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
+ auto err = executorch::backends::cuda::CudaAllocator::memcpy_async(
+ dst, src, nbytes, direction, stream);
+ ET_CHECK_MSG(err == executorch::runtime::Error::Ok, "memcpy_async failed");
}
- /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
- /// @param dst Destination pointer.
- /// @param src Source pointer.
- /// @param nbytes Number of bytes to copy.
- /// @param dst_device Destination device.
- /// @param src_device Source device.
static void memcpy(
void* dst,
const void* src,
@@ -180,7 +165,6 @@ struct DeviceTraits {
const c10::Device& dst_device,
const c10::Device& src_device) {
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
-
if (src_device.is_cpu()) {
direction = cudaMemcpyHostToDevice;
} else if (dst_device.is_cpu()) {
diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl
index b9148305c91..42a7b79da6e 100644
--- a/backends/aoti/slim/core/targets.bzl
+++ b/backends/aoti/slim/core/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets():
"//executorch/runtime/platform:platform",
"//executorch/backends/aoti/slim/c10/cuda:exception",
"//executorch/backends/aoti/slim/cuda:guard",
+ "//executorch/backends/cuda/runtime:cuda_allocator",
],
)
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index f13f41ab8b7..c8449a95718 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -74,6 +74,33 @@ runtime.cxx_library(
],
)
+runtime.cxx_library(
+ name = "cuda_allocator",
+ srcs = [
+ "cuda_allocator.cpp",
+ ],
+ headers = [
+ "cuda_allocator.h",
+ ],
+ # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+ link_whole = True,
+ supports_python_dlopen = True,
+ visibility = ["PUBLIC"],
+ exported_deps = [
+ "//executorch/runtime/core:device_allocator",
+ ],
+ deps = [
+ "//executorch/runtime/platform:platform",
+ ],
+ nvcc_flags = get_nvcc_arch_args() + [
+ "-_NVCC_HOST_COMPILER_FLAG_",
+ "gcc",
+ ],
+ external_deps = [
+ ("cuda", None, "cuda-lazy"),
+ ],
+)
+
runtime.cxx_library(
name = "cuda_backend",
srcs = [
@@ -92,6 +119,8 @@ runtime.cxx_library(
deps = [
":cuda_platform",
":runtime_shims",
+ ":cuda_allocator",
+ ":cuda_platform",
"//executorch/backends/aoti:aoti_common_slim",
"//executorch/backends/aoti/slim/core:slimtensor",
"//executorch/backends/aoti/slim/factory:empty",
diff --git a/backends/cuda/runtime/cuda_allocator.cpp b/backends/cuda/runtime/cuda_allocator.cpp
new file mode 100644
index 00000000000..94294b08fa0
--- /dev/null
+++ b/backends/cuda/runtime/cuda_allocator.cpp
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+
+#include
+
+namespace executorch::backends::cuda {
+
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+Result
+CudaAllocator::allocate(size_t nbytes, DeviceIndex index, size_t alignment) {
+ // index == -1 means "use the current CUDA device"; any value < -1 is invalid.
+ ET_CHECK_OR_RETURN_ERROR(
+ index >= -1,
+ InvalidArgument,
+ "CudaAllocator::allocate: invalid device index %d (must be >= -1)",
+ static_cast(index));
+
+ // Alignment must be a non-zero power of 2.
+ ET_CHECK_OR_RETURN_ERROR(
+ alignment != 0 && (alignment & (alignment - 1)) == 0,
+ InvalidArgument,
+ "CudaAllocator::allocate: alignment must be a power of 2, got %zu",
+ alignment);
+
+ // cudaMalloc is documented to return memory aligned to at least 256 bytes,
+ // which trivially satisfies kDefaultAlignment (alignof(void*)). For any
+ // requested alignment <= 256 bytes, the returned pointer is already aligned.
+ // Stricter alignment would require over-allocation plus bookkeeping that
+ // deallocate() does not currently support, so reject that case.
+ constexpr size_t kCudaMallocAlignment = 256;
+ ET_CHECK_OR_RETURN_ERROR(
+ alignment <= kCudaMallocAlignment,
+ NotSupported,
+ "CudaAllocator::allocate: requested alignment %zu exceeds cudaMalloc's "
+ "guaranteed alignment of %zu bytes; stricter alignment is not supported",
+ alignment,
+ kCudaMallocAlignment);
+
+ void* ptr = nullptr;
+ int prev_device = 0;
+ cudaError_t prev_device_err = cudaGetDevice(&prev_device);
+
+ // If index == -1, fall back to the current device returned by cudaGetDevice
+ // and skip the set/restore round-trip.
+ const bool switch_device = index >= 0 && prev_device_err == cudaSuccess &&
+ static_cast(index) != prev_device;
+ if (switch_device) {
+ cudaSetDevice(index);
+ }
+
+ cudaError_t err = cudaMalloc(&ptr, nbytes);
+
+ if (switch_device) {
+ cudaSetDevice(prev_device);
+ }
+
+ if (err != cudaSuccess) {
+ ET_LOG(
+ Error,
+ "cudaMalloc failed: %s (requested %zu bytes on device %d)",
+ cudaGetErrorString(err),
+ nbytes,
+ static_cast(index));
+ return Error::MemoryAllocationFailed;
+ }
+
+ // Sanity check: the pointer returned by cudaMalloc should already meet the
+ // requested alignment. If a future CUDA runtime weakens this guarantee, we
+ // want to fail loudly rather than silently return a misaligned pointer.
+ if ((reinterpret_cast(ptr) & (alignment - 1)) != 0) {
+ ET_LOG(
+ Error,
+ "cudaMalloc returned pointer %p not aligned to %zu bytes",
+ ptr,
+ alignment);
+ cudaFree(ptr);
+ return Error::MemoryAllocationFailed;
+ }
+
+ return ptr;
+}
+
+void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
+ if (ptr == nullptr) {
+ return;
+ }
+
+ int prev_device = 0;
+ cudaError_t prev_device_err = cudaSuccess;
+
+ if (index >= 0) {
+ prev_device_err = cudaGetDevice(&prev_device);
+ if (prev_device_err == cudaSuccess) {
+ cudaSetDevice(index);
+ }
+ }
+
+ cudaError_t err = cudaFree(ptr);
+
+ if (index >= 0 && prev_device_err == cudaSuccess) {
+ cudaSetDevice(prev_device);
+ }
+
+ if (err != cudaSuccess) {
+ ET_LOG(
+ Error,
+ "cudaFree failed: %s (ptr=%p, device %d)",
+ cudaGetErrorString(err),
+ ptr,
+ static_cast(index));
+ }
+}
+
+// TODO(gasoonjia): Add support for async copy
+Error CudaAllocator::copy_host_to_device(
+ void* dst,
+ const void* src,
+ size_t nbytes,
+ DeviceIndex index) {
+ int prev_device = 0;
+ cudaError_t prev_device_err = cudaSuccess;
+
+ if (index >= 0) {
+ prev_device_err = cudaGetDevice(&prev_device);
+ if (prev_device_err == cudaSuccess) {
+ cudaSetDevice(index);
+ }
+ }
+
+ cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
+
+ if (index >= 0 && prev_device_err == cudaSuccess) {
+ cudaSetDevice(prev_device);
+ }
+
+ if (err != cudaSuccess) {
+ ET_LOG(
+ Error,
+ "cudaMemcpy H2D failed: %s (%zu bytes, device %d)",
+ cudaGetErrorString(err),
+ nbytes,
+ static_cast(index));
+ return Error::Internal;
+ }
+ return Error::Ok;
+}
+
+// TODO(gasoonjia): Add support for async copy
+Error CudaAllocator::copy_device_to_host(
+ void* dst,
+ const void* src,
+ size_t nbytes,
+ DeviceIndex index) {
+ int prev_device = 0;
+ cudaError_t prev_device_err = cudaSuccess;
+
+ if (index >= 0) {
+ prev_device_err = cudaGetDevice(&prev_device);
+ if (prev_device_err == cudaSuccess) {
+ cudaSetDevice(index);
+ }
+ }
+
+ cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
+
+ if (index >= 0 && prev_device_err == cudaSuccess) {
+ cudaSetDevice(prev_device);
+ }
+
+ if (err != cudaSuccess) {
+ ET_LOG(
+ Error,
+ "cudaMemcpy D2H failed: %s (%zu bytes, device %d)",
+ cudaGetErrorString(err),
+ nbytes,
+ static_cast(index));
+ return Error::Internal;
+ }
+ return Error::Ok;
+}
+
+DeviceType CudaAllocator::device_type() const {
+ return DeviceType::CUDA;
+}
+
+CudaAllocator& CudaAllocator::instance() {
+ static CudaAllocator allocator;
+ return allocator;
+}
+
+Result CudaAllocator::allocate_async(
+ size_t nbytes,
+ DeviceIndex index,
+ cudaStream_t stream) {
+ void* ptr = nullptr;
+ cudaError_t err = cudaMallocAsync(&ptr, nbytes, stream);
+ if (err != cudaSuccess) {
+ ET_LOG(
+ Error,
+ "cudaMallocAsync failed: %s (requested %zu bytes on device %d)",
+ cudaGetErrorString(err),
+ nbytes,
+ static_cast(index));
+ return Error::MemoryAllocationFailed;
+ }
+ return ptr;
+}
+
+void CudaAllocator::deallocate_async(
+ void* ptr,
+ DeviceIndex index,
+ cudaStream_t stream) {
+ if (ptr == nullptr) {
+ return;
+ }
+ cudaError_t err = cudaFreeAsync(ptr, stream);
+ if (err != cudaSuccess) {
+ ET_LOG(
+ Error,
+ "cudaFreeAsync failed: %s (ptr=%p, device %d)",
+ cudaGetErrorString(err),
+ ptr,
+ static_cast(index));
+ }
+}
+
+Error CudaAllocator::memcpy_async(
+ void* dst,
+ const void* src,
+ size_t nbytes,
+ cudaMemcpyKind direction,
+ cudaStream_t stream) {
+ cudaError_t err = cudaMemcpyAsync(dst, src, nbytes, direction, stream);
+ if (err != cudaSuccess) {
+ ET_LOG(
+ Error,
+ "cudaMemcpyAsync failed: %s (%zu bytes)",
+ cudaGetErrorString(err),
+ nbytes);
+ return Error::Internal;
+ }
+ return Error::Ok;
+}
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/cuda_allocator.h b/backends/cuda/runtime/cuda_allocator.h
new file mode 100644
index 00000000000..fcd8224305a
--- /dev/null
+++ b/backends/cuda/runtime/cuda_allocator.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+#include
+
+namespace executorch::backends::cuda {
+
+/**
+ * CUDA implementation of DeviceAllocator.
+ *
+ * Uses cudaMalloc/cudaFree for allocation and cudaMemcpy for host-device
+ * transfers. This allocator is automatically registered as a singleton
+ * with the DeviceAllocatorRegistry when the CUDA backend library is linked.
+ *
+ * All CUDA memory operations in the CUDA backend should go through this
+ * allocator for consistent memory management.
+ */
+class CudaAllocator final : public executorch::runtime::DeviceAllocator {
+ public:
+ executorch::runtime::Result allocate(
+ size_t nbytes,
+ executorch::runtime::etensor::DeviceIndex index,
+ size_t alignment = kDefaultAlignment) override;
+
+ void deallocate(void* ptr, executorch::runtime::etensor::DeviceIndex index)
+ override;
+
+ executorch::runtime::Error copy_host_to_device(
+ void* dst,
+ const void* src,
+ size_t nbytes,
+ executorch::runtime::etensor::DeviceIndex index) override;
+
+ executorch::runtime::Error copy_device_to_host(
+ void* dst,
+ const void* src,
+ size_t nbytes,
+ executorch::runtime::etensor::DeviceIndex index) override;
+
+ executorch::runtime::etensor::DeviceType device_type() const override;
+
+ /// Returns the global CudaAllocator singleton.
+ static CudaAllocator& instance();
+
+ // --- Async (stream-based) operations for SlimTensor/Storage layer ---
+
+ /**
+ * Allocate device memory asynchronously on the given CUDA stream.
+ */
+ static executorch::runtime::Result allocate_async(
+ size_t nbytes,
+ executorch::runtime::etensor::DeviceIndex index,
+ cudaStream_t stream);
+
+ /**
+ * Deallocate device memory asynchronously on the given CUDA stream.
+ */
+ static void deallocate_async(
+ void* ptr,
+ executorch::runtime::etensor::DeviceIndex index,
+ cudaStream_t stream);
+
+ /**
+ * Copy memory asynchronously on the given CUDA stream.
+ * Supports H2D, D2H, and D2D based on src/dst device types.
+ */
+ static executorch::runtime::Error memcpy_async(
+ void* dst,
+ const void* src,
+ size_t nbytes,
+ cudaMemcpyKind direction,
+ cudaStream_t stream);
+};
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 1497ba1e376..d2738f7a976 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -40,6 +40,7 @@
// Include our shim layer headers
#include
#include
+#include
#include
#include
#include
@@ -1273,5 +1274,13 @@ auto cls = cuda::CudaBackend();
executorch::runtime::Backend backend{"CudaBackend", &cls};
static executorch::runtime::Error success_with_compiler =
register_backend(backend);
+
+// Auto-register the CudaAllocator so that DeviceMemoryBuffer::create(CUDA)
+// works whenever the CUDA backend library is linked.
+static bool cuda_allocator_registered = [] {
+ executorch::runtime::register_device_allocator(
+ &cuda::CudaAllocator::instance());
+ return true;
+}();
} // namespace
} // namespace executorch::backends
From 7d8063f9e6221ad8724f122ad3ec4cbb1aae2fc6 Mon Sep 17 00:00:00 2001
From: Gasoonjia
Date: Fri, 22 May 2026 20:56:14 -0700
Subject: [PATCH 05/91] [ET Device Support] Define AOT device copy ops registry
(#19748)
clone https://github.com/pytorch/executorch/pull/18728 due to bot crash
---
exir/passes/BUCK | 8 +++
exir/passes/_device_copy_ops_registry.py | 58 +++++++++++++++++++
exir/tests/TARGETS | 11 ++++
exir/tests/test_device_copy_ops.py | 73 ++++++++++++++++++++++++
4 files changed, 150 insertions(+)
create mode 100644 exir/passes/_device_copy_ops_registry.py
create mode 100644 exir/tests/test_device_copy_ops.py
diff --git a/exir/passes/BUCK b/exir/passes/BUCK
index 954f1cfdb4f..4647388b388 100644
--- a/exir/passes/BUCK
+++ b/exir/passes/BUCK
@@ -381,6 +381,14 @@ fbcode_target(_kind = runtime.python_library,
],
)
+fbcode_target(_kind = runtime.python_library,
+ name = "device_copy_ops_registry",
+ srcs = ["_device_copy_ops_registry.py"],
+ deps = [
+ "//caffe2:torch",
+ ],
+)
+
fbcode_target(_kind = runtime.python_library,
name = "memory_format_ops_pass",
srcs = [
diff --git a/exir/passes/_device_copy_ops_registry.py b/exir/passes/_device_copy_ops_registry.py
new file mode 100644
index 00000000000..a62b88d4234
--- /dev/null
+++ b/exir/passes/_device_copy_ops_registry.py
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Registry for device copy ops used to insert explicit H2D (host-to-device)
+and D2H (device-to-host) data transfer operations at delegate boundaries.
+
+These ops are inserted by PropagateDevicePass when enable_non_cpu_memory_planning
+is True, making the graph functional by explicitly transferring data between
+CPU and device memory.
+
+Follows the same registration pattern as dim_order_ops_registry.py.
+"""
+
+import torch
+from torch.library import impl, Library
+
+lib = Library("et_copy", "DEF")
+
+# _h2d_copy: copies a CPU tensor to device memory.
+# At tracing time, this is a clone (both on CPU). At runtime, the out tensor
+# is memory-planned on device, and the kernel calls
+# DeviceAllocator::copy_host_to_device.
+lib.define("_h2d_copy(Tensor self) -> Tensor")
+lib.define("_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+
+# _d2h_copy: copies a device tensor to CPU memory.
+# At tracing time, this is a clone (both on CPU). At runtime, the self tensor
+# has device memory, and the kernel calls DeviceAllocator::copy_device_to_host.
+lib.define("_d2h_copy(Tensor self) -> Tensor")
+lib.define("_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+
+
+@impl(lib, "_h2d_copy", "CompositeImplicitAutograd")
+def _h2d_copy_impl(self: torch.Tensor) -> torch.Tensor:
+ # During tracing, both tensors are on CPU. Just clone to represent the transfer.
+ return self.clone()
+
+
+@impl(lib, "_h2d_copy.out", "CompositeImplicitAutograd")
+def _h2d_copy_out_impl(self: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor:
+ out.copy_(self)
+ return out
+
+
+@impl(lib, "_d2h_copy", "CompositeImplicitAutograd")
+def _d2h_copy_impl(self: torch.Tensor) -> torch.Tensor:
+ # During tracing, both tensors are on CPU. Just clone to represent the transfer.
+ return self.clone()
+
+
+@impl(lib, "_d2h_copy.out", "CompositeImplicitAutograd")
+def _d2h_copy_out_impl(self: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor:
+ out.copy_(self)
+ return out
diff --git a/exir/tests/TARGETS b/exir/tests/TARGETS
index 322f72c870a..21493a69644 100644
--- a/exir/tests/TARGETS
+++ b/exir/tests/TARGETS
@@ -504,3 +504,14 @@ python_unittest(
"//executorch/exir/passes:propagate_device_pass",
],
)
+
+python_unittest(
+ name = "device_copy_ops",
+ srcs = [
+ "test_device_copy_ops.py",
+ ],
+ deps = [
+ "//caffe2:torch",
+ "//executorch/exir/passes:device_copy_ops_registry",
+ ],
+)
diff --git a/exir/tests/test_device_copy_ops.py b/exir/tests/test_device_copy_ops.py
new file mode 100644
index 00000000000..805159d9d81
--- /dev/null
+++ b/exir/tests/test_device_copy_ops.py
@@ -0,0 +1,73 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+# Import the registry to register the ops
+import executorch.exir.passes._device_copy_ops_registry # noqa: F401
+
+import torch
+
+
+class DeviceCopyOpsRegistryTest(unittest.TestCase):
+ """Tests that et_copy._h2d_copy and et_copy._d2h_copy ops are correctly
+ registered and produce expected outputs during tracing (CPU-only)."""
+
+ def test_h2d_copy_functional(self):
+ """_h2d_copy should return a clone of the input tensor."""
+ x = torch.randn(2, 3)
+ result = torch.ops.et_copy._h2d_copy(x)
+ self.assertEqual(result.shape, x.shape)
+ self.assertEqual(result.dtype, x.dtype)
+ self.assertTrue(torch.equal(result, x))
+ # Should be a new tensor, not the same object
+ self.assertFalse(result.data_ptr() == x.data_ptr())
+
+ def test_d2h_copy_functional(self):
+ """_d2h_copy should return a clone of the input tensor."""
+ x = torch.randn(4, 5)
+ result = torch.ops.et_copy._d2h_copy(x)
+ self.assertEqual(result.shape, x.shape)
+ self.assertEqual(result.dtype, x.dtype)
+ self.assertTrue(torch.equal(result, x))
+ self.assertFalse(result.data_ptr() == x.data_ptr())
+
+ def test_h2d_copy_out_variant(self):
+ """_h2d_copy.out should copy data into the provided out tensor."""
+ x = torch.randn(3, 3)
+ out = torch.empty(3, 3)
+ result = torch.ops.et_copy._h2d_copy.out(x, out=out)
+ self.assertTrue(result is out)
+ self.assertTrue(torch.equal(out, x))
+
+ def test_d2h_copy_out_variant(self):
+ """_d2h_copy.out should copy data into the provided out tensor."""
+ x = torch.randn(2, 4)
+ out = torch.empty(2, 4)
+ result = torch.ops.et_copy._d2h_copy.out(x, out=out)
+ self.assertTrue(result is out)
+ self.assertTrue(torch.equal(out, x))
+
+ def test_h2d_copy_preserves_dtype(self):
+ """_h2d_copy should work with various dtypes."""
+ for dtype in [torch.float32, torch.float16, torch.int32, torch.int64]:
+ x = torch.ones(2, 2, dtype=dtype)
+ result = torch.ops.et_copy._h2d_copy(x)
+ self.assertEqual(result.dtype, dtype)
+ self.assertTrue(torch.equal(result, x))
+
+ def test_h2d_copy_scalar_tensor(self):
+ """_h2d_copy should handle 0-dim tensors."""
+ x = torch.tensor(3.14)
+ result = torch.ops.et_copy._h2d_copy(x)
+ self.assertEqual(result.shape, torch.Size([]))
+ self.assertTrue(torch.equal(result, x))
+
+ def test_d2h_copy_empty_tensor(self):
+ """_d2h_copy should handle empty tensors."""
+ x = torch.empty(0, 3)
+ result = torch.ops.et_copy._d2h_copy(x)
+ self.assertEqual(result.shape, torch.Size([0, 3]))
From d757776f51bc41aedac47fe51dd020474726774c Mon Sep 17 00:00:00 2001
From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com>
Date: Sat, 23 May 2026 11:50:33 -0700
Subject: [PATCH 06/91] Add extension_llm_runner to CMake deps (#19749)
Differential Revision: D106162684
Pull Request resolved: https://github.com/pytorch/executorch/pull/19749
---
examples/models/parakeet/main.cpp | 9 +++++----
extension/asr/runner/CMakeLists.txt | 2 +-
extension/asr/runner/transducer_runner.cpp | 16 ++++++++++++----
extension/asr/runner/transducer_runner.h | 13 +++++++++++--
4 files changed, 29 insertions(+), 11 deletions(-)
diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp
index 249e8fd14d4..b8a052004e4 100644
--- a/examples/models/parakeet/main.cpp
+++ b/examples/models/parakeet/main.cpp
@@ -152,13 +152,14 @@ int main(int argc, char** argv) {
ET_LOG(Error, "Preprocessing failed.");
return 1;
}
- auto mel_features = preprocess_result.get();
+ auto preprocess_out = preprocess_result.get();
// --- Transcribe ---
ET_LOG(Info, "Running TDT greedy decode...");
- auto result = runner.transcribe(mel_features, [](const std::string& piece) {
- std::cout << piece << std::flush;
- });
+ auto result = runner.transcribe(
+ preprocess_out.features,
+ [](const std::string& piece) { std::cout << piece << std::flush; },
+ preprocess_out.length);
if (!result.ok()) {
ET_LOG(Error, "Transcription failed.");
diff --git a/extension/asr/runner/CMakeLists.txt b/extension/asr/runner/CMakeLists.txt
index 66974aa2a24..b47cddaf48c 100644
--- a/extension/asr/runner/CMakeLists.txt
+++ b/extension/asr/runner/CMakeLists.txt
@@ -22,7 +22,7 @@ endif()
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
set(runner_deps executorch_core extension_module extension_tensor
- tokenizers::tokenizers
+ extension_llm_runner tokenizers::tokenizers
)
# Define runner library
diff --git a/extension/asr/runner/transducer_runner.cpp b/extension/asr/runner/transducer_runner.cpp
index 3461cb09cc1..7b9298845a9 100644
--- a/extension/asr/runner/transducer_runner.cpp
+++ b/extension/asr/runner/transducer_runner.cpp
@@ -200,7 +200,7 @@ Error TransducerRunner::load() {
return Error::Ok;
}
-Result<::executorch::extension::TensorPtr> TransducerRunner::preprocess(
+Result TransducerRunner::preprocess(
::executorch::extension::TensorPtr raw_audio) {
if (!is_loaded()) {
ET_CHECK_OK_OR_RETURN_ERROR(load());
@@ -229,12 +229,18 @@ Result<::executorch::extension::TensorPtr> TransducerRunner::preprocess(
"Preprocessor returned unexpected output.");
auto mel = outputs[0].toTensor();
- return std::make_shared<::executorch::aten::Tensor>(std::move(mel));
+ int64_t mel_len = mel.sizes()[1]; // default to tensor dim
+ if (outputs.size() >= 2 && outputs[1].isTensor()) {
+ mel_len = outputs[1].toTensor().const_data_ptr()[0];
+ }
+ return PreprocessResult{
+ std::make_shared<::executorch::aten::Tensor>(std::move(mel)), mel_len};
}
Result> TransducerRunner::transcribe(
::executorch::extension::TensorPtr preprocessed_features,
- std::function token_callback) {
+ std::function token_callback,
+ int64_t features_length) {
if (!is_loaded()) {
ET_CHECK_OK_OR_RETURN_ERROR(load());
}
@@ -242,7 +248,9 @@ Result> TransducerRunner::transcribe(
stats_.inference_start_ms = ::executorch::extension::llm::time_in_ms();
// --- Encode ---
- int64_t mel_len_value = preprocessed_features->size(1);
+ // Use provided length, or fall back to tensor dimension
+ int64_t mel_len_value =
+ features_length > 0 ? features_length : preprocessed_features->size(1);
std::vector mel_len_data = {mel_len_value};
auto mel_len = ::executorch::extension::from_blob(
mel_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
diff --git a/extension/asr/runner/transducer_runner.h b/extension/asr/runner/transducer_runner.h
index ee819590141..aed0ad84cd6 100644
--- a/extension/asr/runner/transducer_runner.h
+++ b/extension/asr/runner/transducer_runner.h
@@ -29,6 +29,14 @@ using ::executorch::extension::llm::Stats;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;
+/**
+ * Preprocessed audio features with actual (unpadded) length.
+ */
+struct PreprocessResult {
+ ::executorch::extension::TensorPtr features;
+ int64_t length; // Actual number of valid frames (excluding padding)
+};
+
/**
* A decoded token with frame-level timing information.
*/
@@ -97,7 +105,7 @@ class ET_EXPERIMENTAL TransducerRunner {
* @returns Preprocessed features tensor (e.g., mel spectrogram),
* ready to pass to transcribe().
*/
- Result<::executorch::extension::TensorPtr> preprocess(
+ Result preprocess(
::executorch::extension::TensorPtr raw_audio);
/**
@@ -112,7 +120,8 @@ class ET_EXPERIMENTAL TransducerRunner {
*/
Result> transcribe(
::executorch::extension::TensorPtr preprocessed_features,
- std::function token_callback = {});
+ std::function token_callback = {},
+ int64_t features_length = -1);
/**
* Returns a reference to the loaded tokenizer, or nullptr if not loaded.
From b69cbcd6ffefe6e13fa25c4ea9285786b04692ca Mon Sep 17 00:00:00 2001
From: roman-janik-nxp
Date: Sun, 24 May 2026 11:43:13 +0200
Subject: [PATCH 07/91] NXP backend: Enable Add Tensor with new Neutron flow
(#19550)
### Summary
Add tests verifying correct support for add.tensor by the Neutron
backend using the new Neutron MLIR flow.
### Test plan
Unit tests provided.
cc @robert-kalmar
---
.../ops_converters/add_tensor_converter.py | 42 ++-
.../test_add_tensor_converter.py | 263 +++++++++++++++++-
backends/nxp/tests/models.py | 4 +-
backends/nxp/tests/ops_aliases.py | 1 +
4 files changed, 293 insertions(+), 17 deletions(-)
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
index fd28b077b8a..673af19310f 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
@@ -3,6 +3,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+import torch
+
+from executorch.backends.nxp.backend.data_format import NXP_NODE_FORMAT
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
@@ -23,11 +26,33 @@ def _is_supported_on_target(
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- if NodeConverter.uses_shape_broadcasting(node):
- # Shape broadcasting may require the addition of `Transpose` ops during conversion.
- return False
+ if custom_delegation_options.use_new_flow_neutron_c:
+ if not NodeConverter.at_least_one_input_shape_matches_the_output_shape(
+ node
+ ):
+ return False
- return True
+ # If one input is in channel first and ranks of input tensors are not equal, we need to add Transposes
+ # Transpose is currently not supported for new flow
+ if any(
+ input_node.meta[NXP_NODE_FORMAT].is_channels_first()
+ for input_node in node.all_input_nodes
+ ) and NodeConverter._node_inputs_ranks_not_equal(node):
+ return False
+
+ supported_types = [torch.int8, torch.uint8]
+ if not NodeConverter.uses_quantization_type_for_io(
+ node, supported_types, [0, 1], [0]
+ ):
+ return False
+
+ return True
+ else:
+ if NodeConverter.uses_shape_broadcasting(node):
+ # Shape broadcasting may require the addition of `Transpose` ops during conversion.
+ return False
+
+ return True
@staticmethod
def _is_supported_in_IR(
@@ -43,12 +68,13 @@ def _is_supported_in_IR(
return True
- # add.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1)
def convert(self, node: Node):
- """Convert 'add_tensor' operator to TFLite 'add'."""
+ """Convert 'add_tensor' operator to NeutronIR 'Add'.
+ The ExecuTorch schema is:
+ add.Tensor(Tensor self, Tensor other, Scalar alpha=1)
+ """
self.assert_convertible(node)
-
t_op = self._create_tflite_op_with_io_tensors(node)
-
t_op.builtin_options = add_options.Add()
+
self.builder.append_operators([t_op])
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
index 1aa58ab5d95..4a656eb9517 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
@@ -1,7 +1,8 @@
-# Copyright 2025 NXP
+# Copyright 2025-2026 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+
import numpy as np
import pytest
import torch
@@ -9,17 +10,29 @@
from executorch.backends.nxp.backend.edge_program_converter import (
EdgeProgramToIRConverter,
)
-from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
+from executorch.backends.nxp.tests.executorch_pipeline import (
+ ModelInputSpec,
+ to_quantized_edge_program,
+)
from executorch.backends.nxp.tests.executors import (
convert_run_compare,
+ graph_contains_any_of_ops,
ToChannelFirstPreprocess,
ToChannelLastPreprocess,
)
+from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
from executorch.backends.nxp.tests.models import (
AddTensorConvModule,
AddTensorModule,
AddTensorOneInputModule,
)
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+from executorch.backends.nxp.tests.ops_aliases import (
+ AddTensor,
+ Convolution,
+ ExecutorchDelegateCall,
+)
from torch.export import ExportedProgram
from executorch.backends.nxp.tests.use_qat import * # noqa F403
@@ -92,20 +105,26 @@ def test_add_tensor_one_input_quant_conversion(mocker, input_shape, use_qat):
@pytest.mark.parametrize(
- "input_shape",
+ "x_input_shape",
[
pytest.param((1, 4, 8, 8), id="4D."),
pytest.param((1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."),
],
)
-def test_add_tensor_w_conv_quant_conversion(mocker, input_shape, use_qat):
+def test_add_tensor_w_conv_quant_conversion(mocker, x_input_shape, use_qat):
model = AddTensorConvModule()
converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+ n, c, h, w = x_input_shape
+ y_input_shape = (n, 8, h, w)
+
# Run conversion
_ = to_quantized_edge_program(
- model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False
+ model,
+ [x_input_shape, y_input_shape],
+ use_qat=use_qat,
+ use_neutron_for_format_conversion=False,
)
# Capture generated model
@@ -114,7 +133,13 @@ def test_add_tensor_w_conv_quant_conversion(mocker, input_shape, use_qat):
# Capture converted program
exported_program: ExportedProgram = converter_spy.call_args.args[1]
- input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8)
+ input_data_1 = (np.random.random(x_input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data_2 = (np.random.random(y_input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data = {0: input_data_1, 1: input_data_2}
convert_run_compare(
exported_program,
@@ -149,7 +174,7 @@ def test_add_tensor_broadcasting_unsupported_quant_conversion(
nodes = list(edge_program.graph.nodes)
# Broadcast is not supported, node is not converted
- assert nodes[6].target.__name__ == "aten.add.Tensor" # Add Tensor is not delegated.
+ assert nodes[6].target == AddTensor # Add Tensor is not delegated.
# Capture converted program
# exported_program: ExportedProgram = converter_spy.call_args.args[1]
@@ -159,3 +184,227 @@ def test_add_tensor_broadcasting_unsupported_quant_conversion(
# input_data = {0: x_input_data, 1: y_input_data}
#
# convert_run_compare(exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data)
+
+
+class TestAddTensorNewNeutronFlow:
+ @pytest.mark.parametrize(
+ "x_input_shape",
+ [
+ pytest.param((1,), id="1D."),
+ pytest.param((6, 5), id="2D."),
+ pytest.param((1, 4, 7), id="3D."),
+ pytest.param((2, 4, 3, 15), id="4D."),
+ pytest.param(
+ (6, 82),
+ id="2D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ pytest.param(
+ (1, 68, 7),
+ id="3D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ pytest.param(
+ (1, 4, 9, 11, 4),
+ id="5D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ ],
+ )
+ def test__basic_nsys_inference(self, x_input_shape, mocker):
+ x_input_spec = ModelInputSpec(x_input_shape)
+ model = AddTensorModule()
+ graph_verifier = DetailedGraphVerifier(
+ mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ [x_input_spec, x_input_spec],
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ )
+
+ @pytest.mark.parametrize(
+ "x_input_shape",
+ [
+ pytest.param((1,), id="1D."),
+ pytest.param((6, 5), id="2D."),
+ pytest.param((1, 4, 7), id="3D."),
+ pytest.param((2, 4, 3, 15), id="4D."),
+ pytest.param(
+ (1, 4, 9, 11, 4),
+ id="5D.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ ],
+ )
+ def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
+ x_input_spec = ModelInputSpec(x_input_shape)
+ model = AddTensorModule()
+ graph_verifier = DetailedGraphVerifier(
+ mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ [x_input_spec, x_input_spec],
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ use_qat=True,
+ )
+
+ @pytest.mark.parametrize(
+ "input_spec",
+ [
+ pytest.param(
+ [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D."
+ ),
+ pytest.param(
+ [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))],
+ id="2 inputs 3D.",
+ ),
+ pytest.param(
+ [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D."
+ ),
+ pytest.param(
+ [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
+ id="2 inputs 2D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ ],
+ )
+ def test__broadcast(self, input_spec, mocker):
+ model = AddTensorModule()
+ graph_verifier = DetailedGraphVerifier(
+ mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={}
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ input_spec,
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ )
+
+ @pytest.mark.parametrize(
+ "input_spec",
+ [
+ pytest.param(
+ [ModelInputSpec((4, 1)), ModelInputSpec((1, 6))], id="2 inputs 2D."
+ ),
+ pytest.param(
+ [ModelInputSpec((1, 3, 4)), ModelInputSpec((5, 3, 1))],
+ id="2 inputs 3D.",
+ ),
+ pytest.param(
+ [ModelInputSpec((6, 4)), ModelInputSpec((6, 6, 1))],
+ id="2 inputs 2D + 3D.",
+ ),
+ ],
+ )
+ def test__broadcast_unsupported(self, input_spec):
+ # Broadcast where at least one of the inputs is not equal to output is not supported
+ model = AddTensorModule()
+
+ delegated_ep = to_quantized_edge_program(
+ model, input_spec, use_new_flow_neutron_c=True
+ ).exported_program()
+
+ # Make sure the `add.Tensor` was NOT delegated.
+ assert not graph_contains_any_of_ops(
+ delegated_ep.graph, [ExecutorchDelegateCall]
+ )
+ assert graph_contains_any_of_ops(delegated_ep.graph, [AddTensor])
+
+ @pytest.mark.parametrize(
+ "x_input_shape",
+ [
+ pytest.param(
+ (1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."
+ ),
+ ],
+ )
+ def test__w_conv(self, x_input_shape, mocker):
+ model = AddTensorConvModule()
+
+ n, c, h, w = x_input_shape
+ y_input_spec = ModelInputSpec((n, 8, h, w))
+ x_input_spec = ModelInputSpec(x_input_shape)
+
+ graph_verifier = DetailedGraphVerifier(
+ mocker,
+ expected_delegated_ops={AddTensor: 1, Convolution: 1},
+ expected_non_delegated_ops={},
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ [x_input_spec, y_input_spec],
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ )
+
+ @pytest.mark.parametrize(
+ "input_spec",
+ [
+ pytest.param(
+ [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 8, 5, 1))],
+ id="2 inputs 4D + 4D.",
+ ),
+ pytest.param(
+ [ModelInputSpec((1, 4, 5, 67)), ModelInputSpec((1, 8, 5, 1))],
+ id="2 inputs 4D + 4D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ ],
+ )
+ def test__w_conv_broadcast(self, input_spec, mocker):
+ model = AddTensorConvModule()
+
+ graph_verifier = DetailedGraphVerifier(
+ mocker,
+ expected_delegated_ops={AddTensor: 1, Convolution: 1},
+ expected_non_delegated_ops={},
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ input_spec,
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ )
+
+ @pytest.mark.parametrize(
+ "input_spec",
+ [
+ pytest.param(
+ [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 5))],
+ id="2 inputs 4D + 2D.",
+ ),
+ pytest.param(
+ [ModelInputSpec((1, 4, 4, 10)), ModelInputSpec((1, 4, 1))],
+ id="2 inputs 4D + 3D.",
+ ),
+ ],
+ )
+ def test__w_conv_unsupported(self, input_spec):
+ model = AddTensorConvModule()
+
+ delegated_ep = to_quantized_edge_program(
+ model, input_spec, use_new_flow_neutron_c=True
+ ).exported_program()
+
+ # Make sure the `add.Tensor` was NOT delegated.
+ assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
+ assert graph_contains_any_of_ops(delegated_ep.graph, [AddTensor])
diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py
index 045dcfaba40..1292c4cf17d 100644
--- a/backends/nxp/tests/models.py
+++ b/backends/nxp/tests/models.py
@@ -656,9 +656,9 @@ def __init__(self):
super().__init__()
self.conv = Conv2dModule(padding=1, stride=1)
- def forward(self, x):
+ def forward(self, x, y):
x = self.conv(x)
- return x + x
+ return x + y
class AddTensorOneInputModule(torch.nn.Module):
diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py
index ec58072658d..9e6bedc5dba 100644
--- a/backends/nxp/tests/ops_aliases.py
+++ b/backends/nxp/tests/ops_aliases.py
@@ -13,6 +13,7 @@
Abs = exir_ops.edge.aten.abs.default
AdaptiveAvgPool2D = exir_ops.edge.aten._adaptive_avg_pool2d.default
+AddTensor = exir_ops.edge.aten.add.Tensor
AvgPool2D = exir_ops.edge.aten.avg_pool2d.default
Bmm = exir_ops.edge.aten.bmm.default
ConstantPadND = exir_ops.edge.aten.constant_pad_nd.default
From ba6074c3868abb8f602a22565445b52f8b5bdfb1 Mon Sep 17 00:00:00 2001
From: Julian Chan <128482247+julianchan-meta@users.noreply.github.com>
Date: Sun, 24 May 2026 23:53:19 -0700
Subject: [PATCH 08/91] Back out "Globally serialize XNNPACK execution, add
logging" (#19752)
Differential Revision: D106254596
Pull Request resolved: https://github.com/pytorch/executorch/pull/19752
---
backends/xnnpack/runtime/XNNPACKBackend.cpp | 53 +--------------------
1 file changed, 2 insertions(+), 51 deletions(-)
diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp
index 2fe1e4d162e..c20fa985f46 100644
--- a/backends/xnnpack/runtime/XNNPACKBackend.cpp
+++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp
@@ -16,7 +16,6 @@
#include
#include
-#include
#include
#include
@@ -42,13 +41,6 @@ using executorch::runtime::FreeableBuffer;
using executorch::runtime::Result;
using executorch::runtime::Span;
-// Global mutex for all XNNPACK operations. This is temporary, tracked by
-// T272407942.
-static std::mutex& global_xnnpack_mutex() {
- static std::mutex m;
- return m;
-}
-
class XnnpackBackend final
: public ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface {
public:
@@ -74,8 +66,6 @@ class XnnpackBackend final
BackendInitContext& context,
FreeableBuffer* processed,
ArrayRef compile_specs) const override {
- const std::lock_guard global_lock(global_xnnpack_mutex());
-
auto executor = context.get_runtime_allocator()
->allocateInstance();
if (executor == nullptr) {
@@ -139,17 +129,6 @@ class XnnpackBackend final
Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned int)err);
return err;
}
-
- ET_LOG(
- Info,
- "XnnpackBackend::init delegate=%p workspace_id=%" PRIu64
- " workspace_ptr=%p program_id=0x%" PRIxPTR " weight_cache=%s",
- (void*)executor,
- workspace->id(),
- (void*)workspace_ptr,
- program_id,
- use_weight_cache ? "true" : "false");
-
return executor;
}
@@ -157,27 +136,15 @@ class XnnpackBackend final
BackendExecutionContext& context,
DelegateHandle* handle,
Span args) const override {
- const std::lock_guard global_lock(global_xnnpack_mutex());
-
auto executor = static_cast(handle);
- auto workspace = executor->get_workspace();
- ET_LOG(
- Info,
- "XnnpackBackend::execute begin delegate=%p workspace_id=%" PRIu64
- " num_args=%zu weight_cache=%s",
- (void*)executor,
- workspace->id(),
- (size_t)args.size(),
- executor->uses_weight_cache() ? "true" : "false");
-
std::unique_lock lock_weights_cache(
weights_cache_mutex_, std::defer_lock);
if (executor->uses_weight_cache()) {
lock_weights_cache.lock();
}
- auto [raii_lock, _] = workspace->acquire();
+ auto [raii_lock, _] = executor->get_workspace()->acquire();
// Prepare Inputs/Outputs and Propagate Input Shapes
Error err = executor->prepare_args(args);
@@ -194,29 +161,12 @@ class XnnpackBackend final
// Convert output data types if necessary (e.g., int32 -> int64 for Long)
err = executor->convert_outputs(args);
- ET_LOG(
- Info,
- "XnnpackBackend::execute end delegate=%p workspace_id=%" PRIu64
- " err=0x%x",
- (void*)executor,
- workspace->id(),
- (unsigned int)err);
-
return err;
}
void destroy(DelegateHandle* handle) const override {
if (handle != nullptr) {
- const std::lock_guard global_lock(global_xnnpack_mutex());
-
auto executor = static_cast(handle);
- auto workspace = executor->get_workspace();
-
- ET_LOG(
- Info,
- "XnnpackBackend::destroy delegate=%p workspace_id=%" PRIu64,
- (void*)executor,
- workspace->id());
#ifdef ENABLE_XNNPACK_PROFILING
executor->print_avg_op_timings();
@@ -233,6 +183,7 @@ class XnnpackBackend final
// the same backend instance. Make sure to hold onto the workspace
// shared_ptr, as the pointer in the executor is freed, which includes
// the mutex referenced by raii_lock.
+ auto workspace = executor->get_workspace();
auto [raii_lock, _] = workspace->acquire();
// XNNExecutor is not trivially destructible. Since this was constructed
From ee4c90ad03f33398cbfa93cfed09caf04fca6099 Mon Sep 17 00:00:00 2001
From: Per Held
Date: Mon, 25 May 2026 08:59:44 +0200
Subject: [PATCH 09/91] Arm backend: Exclude build metadata from license checks
Treat BUCK and TARGETS files as build metadata in the Arm
pre-push license check so they do not need copyright headers.
Signed-off-by: Per Held
Change-Id: I4b3bbd1e03ba4b9c38fd06225156344985f0cc70
---
backends/arm/scripts/pre-push | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push
index 8e26463cd94..6aa32d07286 100755
--- a/backends/arm/scripts/pre-push
+++ b/backends/arm/scripts/pre-push
@@ -177,7 +177,7 @@ for COMMIT in ${COMMITS}; do
for committed_file in "${license_files[@]}"; do
# Skip files with certain extensions
case "$committed_file" in
- *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl)
+ *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl|BUCK|*/BUCK|TARGETS|*/TARGETS)
echo -e "${INFO} Skipping license check for ${committed_file} (excluded extension)"
continue
;;
From b73df0b4696885c6e03f3789daeece8376078364 Mon Sep 17 00:00:00 2001
From: roman-janik-nxp
Date: Mon, 25 May 2026 13:49:04 +0200
Subject: [PATCH 10/91] NXP backend: Enable Sub Tensor with new Neutron flow
(#19588)
### Summary
Add tests verifying correct support for sub.tensor by the Neutron
backend using the new Neutron MLIR flow.
### Test plan
Unit tests provided.
cc @robert-kalmar @JakeStevens @digantdesai @rascani
---
.../ops_converters/sub_tensor_converter.py | 40 ++-
.../test_avg_pool2d_converter.py | 9 +-
.../test_max_pool_2d_converter.py | 7 +-
.../test_mul_tensor_converter.py | 5 -
.../test_sub_tensor_converter.py | 260 +++++++++++++++++-
backends/nxp/tests/ops_aliases.py | 1 +
6 files changed, 289 insertions(+), 33 deletions(-)
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
index e97f4bf63c2..79dbcbcc012 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
@@ -3,6 +3,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+import torch
+
+from executorch.backends.nxp.backend.data_format import NXP_NODE_FORMAT
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
@@ -23,11 +26,33 @@ def _is_supported_on_target(
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- if NodeConverter.uses_shape_broadcasting(node):
- # Shape broadcasting may require the addition of `Transpose` ops during conversion.
- return False
+ if custom_delegation_options.use_new_flow_neutron_c:
+ if not NodeConverter.at_least_one_input_shape_matches_the_output_shape(
+ node
+ ):
+ return False
- return True
+ # If one input is in channel first and ranks of input tensors are not equal, we need to add Transposes
+ # Transpose is currently not supported for new flow
+ if any(
+ input_node.meta[NXP_NODE_FORMAT].is_channels_first()
+ for input_node in node.all_input_nodes
+ ) and NodeConverter._node_inputs_ranks_not_equal(node):
+ return False
+
+ supported_types = [torch.int8, torch.uint8]
+ if not NodeConverter.uses_quantization_type_for_io(
+ node, supported_types, [0, 1], [0]
+ ):
+ return False
+
+ return True
+ else:
+ if NodeConverter.uses_shape_broadcasting(node):
+ # Shape broadcasting may require the addition of `Transpose` ops during conversion.
+ return False
+
+ return True
@staticmethod
def _is_supported_in_IR(
@@ -45,9 +70,12 @@ def _is_supported_in_IR(
return True
- # sub.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1)
def convert(self, node: Node):
- """Convert 'sub_tensor' operator to NeutronIR 'Sub'."""
+ """Convert 'sub_tensor' operator to NeutronIR 'Sub'.
+ The ExecuTorch schema is:
+ sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1)
+ """
+
self.assert_convertible(node)
t_op = self._create_tflite_op_with_io_tensors(node)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
index 2c73ccd8092..193b7ecf9ab 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py
@@ -6,6 +6,7 @@
import numpy as np
import pytest
import torch
+
from executorch.backends.nxp.backend.edge_program_converter import (
EdgeProgramToIRConverter,
)
@@ -29,13 +30,8 @@
ToNHWCPreprocess,
)
from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
-from executorch.backends.nxp.tests.model_output_comparator import (
- NumericalStatsOutputComparator,
-)
from executorch.backends.nxp.tests.models import AvgPool2dConvModule, AvgPool2dModule
-
from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
-
from executorch.backends.nxp.tests.ops_aliases import (
AvgPool2D,
ExecutorchDelegateCall,
@@ -45,6 +41,7 @@
Unsqueeze,
ViewCopy,
)
+
from torch.export import ExportedProgram
from executorch.backends.nxp.tests.use_qat import * # noqa F403
@@ -320,7 +317,6 @@ def test__basic_nsys_inference(self, mocker):
def test__basic_nsys_inference_qat(self, mocker):
input_shape = (2, 9, 6, 15)
model = AvgPool2dModule(False, 0)
- comparator = NumericalStatsOutputComparator()
graph_verifier = DetailedGraphVerifier(
mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={}
)
@@ -329,7 +325,6 @@ def test__basic_nsys_inference_qat(self, mocker):
model,
input_shape,
graph_verifier,
- output_comparator=comparator,
use_new_flow_neutron_c=True,
use_qat=True,
)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
index 583dc2bfd04..9062d5efbfc 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py
@@ -4,6 +4,7 @@
# LICENSE file in the root directory of this source tree.
import numpy as np
+import pytest
import torch
from executorch.backends.nxp.backend.edge_program_converter import (
@@ -17,9 +18,6 @@
ToChannelLastPreprocess,
)
from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
-from executorch.backends.nxp.tests.model_output_comparator import (
- NumericalStatsOutputComparator,
-)
from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
from executorch.backends.nxp.tests.ops_aliases import (
ExecutorchDelegateCall,
@@ -32,7 +30,6 @@
ViewCopy,
)
from executorch.backends.nxp.tests.use_qat import * # noqa F403
-import pytest
class MaxPool1DModule(torch.nn.Module):
@@ -286,7 +283,6 @@ def test__basic_nsys_inference(self, mocker):
def test__basic_nsys_inference_qat(self, mocker):
input_shape = (2, 11, 7, 16) # The old flow limited the batch size to 1.
model = MaxPool2dModule()
- comparator = NumericalStatsOutputComparator()
graph_verifier = DetailedGraphVerifier(
mocker,
expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1},
@@ -297,7 +293,6 @@ def test__basic_nsys_inference_qat(self, mocker):
model,
input_shape,
graph_verifier,
- output_comparator=comparator,
use_new_flow_neutron_c=True,
use_qat=True,
)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
index 927af47bbf5..90113f484ad 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py
@@ -21,9 +21,6 @@
ToChannelLastPreprocess,
)
from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
-from executorch.backends.nxp.tests.model_output_comparator import (
- NumericalStatsOutputComparator,
-)
from executorch.backends.nxp.tests.models import (
MulTensorConvModule,
MulTensorModule,
@@ -256,7 +253,6 @@ def test__basic_nsys_inference(self, x_input_shape, mocker):
def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
x_input_spec = ModelInputSpec(x_input_shape)
model = MulTensorModule()
- comparator = NumericalStatsOutputComparator()
graph_verifier = DetailedGraphVerifier(
mocker, expected_delegated_ops={MulTensor: 1}, expected_non_delegated_ops={}
)
@@ -265,7 +261,6 @@ def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
model,
[x_input_spec, x_input_spec],
graph_verifier,
- output_comparator=comparator,
use_new_flow_neutron_c=True,
use_qat=True,
)
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
index 9ce3e93f39b..2734e89bc5d 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
@@ -1,7 +1,8 @@
-# Copyright 2025 NXP
+# Copyright 2025-2026 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+
import numpy as np
import pytest
import torch
@@ -9,18 +10,29 @@
from executorch.backends.nxp.backend.edge_program_converter import (
EdgeProgramToIRConverter,
)
-from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
+from executorch.backends.nxp.tests.executorch_pipeline import (
+ ModelInputSpec,
+ to_quantized_edge_program,
+)
from executorch.backends.nxp.tests.executors import (
convert_run_compare,
+ graph_contains_any_of_ops,
ToChannelFirstPreprocess,
ToChannelLastPreprocess,
)
+from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
from executorch.backends.nxp.tests.models import (
SubTensorConvModule,
SubTensorModule,
SubTensorOneInputModule,
)
-from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+from executorch.backends.nxp.tests.ops_aliases import (
+ Convolution,
+ ExecutorchDelegateCall,
+ SubTensor,
+)
from torch.export import ExportedProgram
from executorch.backends.nxp.tests.use_qat import * # noqa F403
@@ -63,7 +75,7 @@ def test_sub_tensor_quant_conversion(mocker, input_shape, use_qat):
input_data = {0: input_data_1, 1: input_data_2}
nodes = list(exported_program.graph.nodes)
- assert nodes[4].target == exir_ops.edge.aten.sub.Tensor
+ assert nodes[4].target == SubTensor
convert_run_compare(
exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data
@@ -96,7 +108,7 @@ def test_sub_tensor_one_input_quant_conversion(mocker, input_shape, use_qat):
input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8)
nodes = list(exported_program.graph.nodes)
- assert nodes[2].target == exir_ops.edge.aten.sub.Tensor
+ assert nodes[2].target == SubTensor
convert_run_compare(
exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data
@@ -141,7 +153,7 @@ def test_sub_tensor_w_conv_quant_conversion(mocker, x_input_shape, use_qat):
input_data = {0: input_data_1, 1: input_data_2}
nodes = list(exported_program.graph.nodes)
- assert nodes[15].target == exir_ops.edge.aten.sub.Tensor
+ assert nodes[15].target == SubTensor
convert_run_compare(
exported_program,
@@ -176,6 +188,236 @@ def test_sub_tensor_broadcasting_unsupported_quant_conversion(
nodes = list(edge_program.graph.nodes)
# Broadcast is not supported, node is not converted
- assert (
- nodes[6].target == exir_ops.edge.aten.sub.Tensor
- ) # Sub Tensor is not delegated.
+ assert nodes[6].target == SubTensor # Sub Tensor is not delegated.
+
+
+class TestSubTensorNewNeutronFlow:
+ @pytest.mark.parametrize(
+ "x_input_shape",
+ [
+ pytest.param((1,), id="1D."),
+ pytest.param((6, 5), id="2D."),
+ pytest.param((1, 4, 7), id="3D."),
+ pytest.param(
+ (6, 82),
+ id="2D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ pytest.param(
+ (1, 68, 7),
+ id="3D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ pytest.param(
+ (2, 4, 3, 15),
+ id="4D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ pytest.param(
+ (1, 4, 9, 11, 4),
+ id="5D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ ],
+ )
+ def test__basic_nsys_inference(self, x_input_shape, mocker):
+ x_input_spec = ModelInputSpec(x_input_shape)
+ model = SubTensorModule()
+ graph_verifier = DetailedGraphVerifier(
+ mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ [x_input_spec, x_input_spec],
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ )
+
+ @pytest.mark.parametrize(
+ "x_input_shape",
+ [
+ pytest.param((1,), id="1D."),
+ pytest.param((6, 5), id="2D."),
+ pytest.param((2, 4, 3, 15), id="4D."),
+ pytest.param(
+ (1, 4, 7),
+ id="3D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ pytest.param(
+ (1, 4, 9, 11, 4),
+ id="5D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ ],
+ )
+ def test__basic_nsys_inference_qat(self, x_input_shape, mocker):
+ x_input_spec = ModelInputSpec(x_input_shape)
+ model = SubTensorModule()
+ graph_verifier = DetailedGraphVerifier(
+ mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ [x_input_spec, x_input_spec],
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ use_qat=True,
+ )
+
+ @pytest.mark.parametrize(
+ "input_spec",
+ [
+ pytest.param(
+ [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D."
+ ),
+ pytest.param(
+ [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D."
+ ),
+ pytest.param(
+ [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))],
+ id="2 inputs 3D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ pytest.param(
+ [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))],
+ id="2 inputs 2D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ ],
+ )
+ def test__broadcast(self, input_spec, mocker):
+ model = SubTensorModule()
+ graph_verifier = DetailedGraphVerifier(
+ mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={}
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ input_spec,
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ )
+
+ @pytest.mark.parametrize(
+ "input_spec",
+ [
+ pytest.param(
+ [ModelInputSpec((4, 1)), ModelInputSpec((1, 6))], id="2 inputs 2D."
+ ),
+ pytest.param(
+ [ModelInputSpec((1, 3, 4)), ModelInputSpec((5, 3, 1))],
+ id="2 inputs 3D.",
+ ),
+ pytest.param(
+ [ModelInputSpec((6, 4)), ModelInputSpec((6, 6, 1))],
+ id="2 inputs 2D+3D.",
+ ),
+ ],
+ )
+ def test__broadcast_unsupported(self, input_spec):
+ # Broadcast where at least one of the inputs is not equal to output is not supported
+ model = SubTensorModule()
+
+ delegated_ep = to_quantized_edge_program(
+ model, input_spec, use_new_flow_neutron_c=True
+ ).exported_program()
+
+ # Make sure the `sub.Tensor` was NOT delegated.
+ assert not graph_contains_any_of_ops(
+ delegated_ep.graph, [ExecutorchDelegateCall]
+ )
+ assert graph_contains_any_of_ops(delegated_ep.graph, [SubTensor])
+
+ @pytest.mark.parametrize(
+ "x_input_shape",
+ [
+ pytest.param(
+ (1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."
+ ),
+ ],
+ )
+ def test__w_conv(self, x_input_shape, mocker):
+ model = SubTensorConvModule()
+
+ n, c, h, w = x_input_shape
+ y_input_spec = ModelInputSpec((n, 8, h, w))
+ x_input_spec = ModelInputSpec(x_input_shape)
+
+ graph_verifier = DetailedGraphVerifier(
+ mocker,
+ expected_delegated_ops={SubTensor: 1, Convolution: 1},
+ expected_non_delegated_ops={},
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ [x_input_spec, y_input_spec],
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ )
+
+ @pytest.mark.parametrize(
+ "input_spec",
+ [
+ pytest.param(
+ [ModelInputSpec((1, 4, 7, 1)), ModelInputSpec((1, 8, 1, 1))],
+ id="2 inputs 4D + 4D.",
+ ),
+ pytest.param(
+ [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 8, 5, 1))],
+ id="2 inputs 4D + 4D incorrect.",
+ marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"),
+ ),
+ ],
+ )
+ def test__w_conv_broadcast(self, input_spec, mocker):
+ model = SubTensorConvModule()
+ graph_verifier = DetailedGraphVerifier(
+ mocker,
+ expected_delegated_ops={SubTensor: 1, Convolution: 1},
+ expected_non_delegated_ops={},
+ )
+ dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0)
+
+ lower_run_compare(
+ model,
+ input_spec,
+ graph_verifier,
+ dataset_creator,
+ use_new_flow_neutron_c=True,
+ )
+
+ @pytest.mark.parametrize(
+ "input_spec",
+ [
+ pytest.param(
+ [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 5))],
+ id="2 inputs 4D + 2D.",
+ ),
+ pytest.param(
+ [ModelInputSpec((1, 4, 4, 10)), ModelInputSpec((1, 4, 1))],
+ id="2 inputs 4D + 3D.",
+ ),
+ ],
+ )
+ def test__w_conv_unsupported(self, input_spec):
+ model = SubTensorConvModule()
+
+ delegated_ep = to_quantized_edge_program(
+ model, input_spec, use_new_flow_neutron_c=True
+ ).exported_program()
+
+ # Make sure the `sub.Tensor` was NOT delegated.
+ assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
+ assert graph_contains_any_of_ops(delegated_ep.graph, [SubTensor])
diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py
index 9e6bedc5dba..7f855dd63af 100644
--- a/backends/nxp/tests/ops_aliases.py
+++ b/backends/nxp/tests/ops_aliases.py
@@ -37,6 +37,7 @@
Squeeze = exir_ops.edge.aten.squeeze.default
SqueezeDim = exir_ops.edge.aten.squeeze.dim
SqueezeDims = exir_ops.edge.aten.squeeze.dims
+SubTensor = exir_ops.edge.aten.sub.Tensor
Unsqueeze = exir_ops.edge.aten.unsqueeze.default
UpsampleBilinear2D = exir_ops.edge.aten.upsample_bilinear2d.vec
UpsampleNearest2D = exir_ops.edge.aten.upsample_nearest2d.vec
From 03e14ef8b3964deb589f3f172b4bbee7d206795a Mon Sep 17 00:00:00 2001
From: Youngsik Yang
Date: Tue, 26 May 2026 01:55:50 +0900
Subject: [PATCH 11/91] Arm backend: Add bf16 support for aten.index_select and
aten.unfold_copy (#19751)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Follow-up to #17097, which added BF16 support to the TOSA GATHER op.
`aten.index_select` and `aten.unfold_copy` both lower via TOSA GATHER
but their support checks were not updated at the time.
In both decompositions(`DecomposeIndexSelectToGatherPass()` and
`DecomposeUnfoldToGatherPass()`),
the bf16 values tensor flows through dtype-agnostic reshape ops and
`tosa.GATHER`, which accepts `BF16`.
The support check was the only blocker.
| Op | bf16 before | bf16 after |
|---------------------|:-----------:|:----------:|
| `aten.gather` | ✅ | ✅ |
| `aten.index.Tensor` | ✅ | ✅ |
| `aten.slice_copy` | ✅ | ✅ |
| `aten.index_select` | ❌ | ✅ |
| `aten.unfold_copy` | ❌ | ✅ |
Changes:
- `index_select_support.py`, `unfold_copy_support.py`: extend float
branch
to include `bfloat16`; add bf16 extension guard; update rejection
message.
- `test_index_select.py`, `test_unfold_copy.py`: add isolated
`_tosa_FP_bf16` test functions using
`TosaPipelineFP(..., tosa_extensions=["bf16"])`.
### Test plan
`test_index_select_tosa_FP_bf16` and `test_unfold_copy_tosa_FP_bf16`
exercise the bf16 path end-to-end through `TosaPipelineFP` with the bf16
extension enabled, following the same pattern of the existing
`test_slice_tensor_tosa_FP_bf16` from #17492
---
.../operator_support/index_select_support.py | 14 ++++++--
.../operator_support/unfold_copy_support.py | 14 ++++++--
backends/arm/test/ops/test_index_select.py | 32 +++++++++++++++++++
backends/arm/test/ops/test_unfold_copy.py | 24 ++++++++++++++
4 files changed, 78 insertions(+), 6 deletions(-)
diff --git a/backends/arm/operator_support/index_select_support.py b/backends/arm/operator_support/index_select_support.py
index a3188e739c7..285b2cfe79f 100644
--- a/backends/arm/operator_support/index_select_support.py
+++ b/backends/arm/operator_support/index_select_support.py
@@ -77,8 +77,16 @@ def is_node_tosa_supported(
f"{node.target}: dtype {values_dtype} requires INT profile.",
)
return False
- # fp16/fp32: either FP profile, or INT profile (via quantization)
- elif values_dtype in (torch.float16, torch.float32):
+ # fp16/fp32/bf16: either FP profile, or INT profile (via quantization)
+ elif values_dtype in (torch.float16, torch.float32, torch.bfloat16):
+ if values_dtype == torch.bfloat16 and not tosa_spec.support_extension(
+ "bf16"
+ ):
+ self.reporter.report_reject(
+ node,
+ f"{node.target}: dtype {values_dtype} requires bf16 extension.",
+ )
+ return False
if not (tosa_spec.support_float() or tosa_spec.support_integer()):
self.reporter.report_reject(
node,
@@ -90,7 +98,7 @@ def is_node_tosa_supported(
self.reporter.report_reject(
node,
f"{node.target}: unsupported values dtype {values_dtype}; "
- "expected bool/int8/int16/int32/float16/float32.",
+ "expected bool/int8/int16/int32/float16/bfloat16/float32.",
)
return False
diff --git a/backends/arm/operator_support/unfold_copy_support.py b/backends/arm/operator_support/unfold_copy_support.py
index bf6c1cad22e..ac9fc7d0ee3 100644
--- a/backends/arm/operator_support/unfold_copy_support.py
+++ b/backends/arm/operator_support/unfold_copy_support.py
@@ -84,8 +84,16 @@ def is_node_tosa_supported(
f"{node.target}: dtype {values_dtype} requires INT profile.",
)
return False
- # fp16/fp32: either FP profile, or INT profile (via quantization)
- elif values_dtype in (torch.float16, torch.float32):
+ # fp16/fp32/bf16: either FP profile, or INT profile (via quantization)
+ elif values_dtype in (torch.float16, torch.float32, torch.bfloat16):
+ if values_dtype == torch.bfloat16 and not tosa_spec.support_extension(
+ "bf16"
+ ):
+ self.reporter.report_reject(
+ node,
+ f"{node.target}: dtype {values_dtype} requires bf16 extension.",
+ )
+ return False
if not (tosa_spec.support_float() or tosa_spec.support_integer()):
self.reporter.report_reject(
node,
@@ -97,7 +105,7 @@ def is_node_tosa_supported(
self.reporter.report_reject(
node,
f"{node.target}: unsupported values dtype {values_dtype}; "
- "expected bool/int8/int16/int32/float16/float32.",
+ "expected bool/int8/int16/int32/float16/bfloat16/float32.",
)
return False
diff --git a/backends/arm/test/ops/test_index_select.py b/backends/arm/test/ops/test_index_select.py
index bb5f0a92c51..4de19d30daf 100644
--- a/backends/arm/test/ops/test_index_select.py
+++ b/backends/arm/test/ops/test_index_select.py
@@ -61,6 +61,26 @@ def forward(self, input_: torch.Tensor, dim: int, index_: torch.Tensor):
torch.tensor([3, 1], dtype=torch.int32), # [W=2]
),
}
+test_data_fp_bf16: dict[str, input_params] = {
+ # Rank-2: [K, C] -> index_select dim=0 => [W, C]
+ "test_bf16_rank2_dim0": (
+ torch.tensor(
+ [[0.5, 1.25, 2.5], [3.5, 4.25, 5.75], [6.5, 7.25, 8.75]],
+ dtype=torch.bfloat16,
+ ), # [K=3, C=3]
+ 0,
+ torch.tensor([2, 0], dtype=torch.int32), # [W=2]
+ ),
+ # Rank-3: [N, K, C] -> index_select dim=-1 => [N, K, W]
+ "test_bf16_rank3_dim_neg1": (
+ torch.tensor(
+ [[[0.5, 1.5], [2.5, 3.5]], [[4.5, 5.5], [6.5, 7.5]]],
+ dtype=torch.bfloat16,
+ ), # [N=2, K=2, C=2]
+ -1,
+ torch.tensor([1, 0], dtype=torch.int32), # [W=2]
+ ),
+}
# ---- INT profile: integer inputs + bool ----
test_data_int: dict[str, input_params] = {
@@ -104,6 +124,18 @@ def test_index_select_tosa_FP(test_data: input_params):
pipeline.run()
+@common.parametrize("test_data", test_data_fp_bf16)
+def test_index_select_tosa_FP_bf16(test_data: input_params):
+ pipeline = TosaPipelineFP[input_params](
+ IndexSelect(),
+ test_data,
+ aten_op=IndexSelect.aten_op,
+ exir_op=IndexSelect.exir_op,
+ tosa_extensions=["bf16"],
+ )
+ pipeline.run()
+
+
@common.parametrize("test_data", test_data_int | test_data_fp)
def test_index_select_tosa_INT(test_data: input_params):
# INT profile runs quantized, so we test both int inputs and float inputs here.
diff --git a/backends/arm/test/ops/test_unfold_copy.py b/backends/arm/test/ops/test_unfold_copy.py
index 2b502a9be10..baa4b7f64bc 100644
--- a/backends/arm/test/ops/test_unfold_copy.py
+++ b/backends/arm/test/ops/test_unfold_copy.py
@@ -120,6 +120,18 @@ def forward(self, input_: torch.Tensor, dim_: int, size_: int, step_: int):
),
}
+test_data_bf16: dict[str, input_params] = {
+ "test_bf16_2d_dim1": (
+ torch.tensor(
+ [[0.1, 0.2, 0.3, 0.4, 0.5], [1.1, 1.2, 1.3, 1.4, 1.5]],
+ dtype=torch.bfloat16,
+ ), # [B=2, T=5]
+ 1,
+ 3,
+ 2, # U=(5-3)//2+1=2 -> [B=2, U=2, C=3]
+ ),
+}
+
@common.parametrize("test_data", test_data_fp)
def test_unfold_copy_tosa_FP(test_data: input_params):
@@ -132,6 +144,18 @@ def test_unfold_copy_tosa_FP(test_data: input_params):
pipeline.run()
+@common.parametrize("test_data", test_data_bf16)
+def test_unfold_copy_tosa_FP_bf16(test_data: input_params):
+ pipeline = TosaPipelineFP[input_params](
+ UnfoldCopy(),
+ test_data,
+ aten_op=UnfoldCopy.aten_op,
+ exir_op=UnfoldCopy.exir_op,
+ tosa_extensions=["bf16"],
+ )
+ pipeline.run()
+
+
@common.parametrize("test_data", test_data_int | test_data_fp)
def test_unfold_copy_tosa_INT(test_data: input_params):
pipeline = TosaPipelineINT[input_params](
From b581615fa86dd2357d866064427a0b93b2ad947f Mon Sep 17 00:00:00 2001
From: Erik Lundell
Date: Tue, 26 May 2026 09:50:10 +0200
Subject: [PATCH 12/91] Cortex-M backend: Add AoT scratch-buffer planning.
(#19636)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This is done for conv, depthwise conv, transpose conv, and bmm.
Add scratch tensors to the operator signatures, which are then
assigned exir.memory.alloc. These allocs are automatically memory
planned by ExecuTorch.
Introduce `required_cmsis_buffer_size`which computes the buffer
size from node properties + the Cortex-M configuration.
The function uses functions registered by target in
backends/cortex_m/passes/scratch_buffer_sizes.py
This is used to set the size of the allocs in ConvertToCortexMPass
Finally, modify the kernels to use the new scratch tensor instead
of allocating temporary memory. Add a new macro
CORTEX_M_ENABLE_RUNTIME_CHECKS
to do a safety check that the aot computed buffer size is equal to the
buffer size computed at runtime. Use this when testing.
cc @psiddh @AdrianLundell @digantdesai @rascani @freddan80 @per @zingo
@oscarandersson8218 @mansnils @Sebastian-Larsson @robell
---------
Signed-off-by: Erik Lundell
Co-authored-by: Måns Nilsson
---
backends/arm/scripts/build_executorch.sh | 8 +
backends/cortex_m/CMakeLists.txt | 9 +
.../ops/op_quantized_batch_matmul.cpp | 35 +--
backends/cortex_m/ops/op_quantized_conv2d.cpp | 34 +--
.../ops/op_quantized_depthwise_conv2d.cpp | 31 +-
.../ops/op_quantized_transpose_conv2d.cpp | 44 +--
backends/cortex_m/ops/operators.py | 28 +-
backends/cortex_m/ops/operators.yaml | 9 +-
backends/cortex_m/passes/__init__.py | 1 +
.../passes/convert_to_cortex_m_pass.py | 64 ++++-
.../cortex_m/passes/scratch_buffer_sizes.py | 266 ++++++++++++++++++
backends/cortex_m/test/build_test_runner.sh | 4 +-
12 files changed, 451 insertions(+), 82 deletions(-)
create mode 100644 backends/cortex_m/passes/scratch_buffer_sizes.py
diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh
index 54d2091d1f4..5ac2674f964 100755
--- a/backends/arm/scripts/build_executorch.sh
+++ b/backends/arm/scripts/build_executorch.sh
@@ -7,6 +7,7 @@
# Optional parameter:
# --build_type= "Release" | "Debug" | "RelWithDebInfo" | "UndefinedSanitizer" | "AddressSanitizer"
# --etdump build with devtools-etdump support
+# --cmake-args= Additional arguments passed to cmake configure
set -eu
@@ -24,6 +25,7 @@ build_type="Release"
build_devtools=OFF
build_with_etdump=OFF
is_linux_musl=0
+extra_cmake_args=()
target_cpu=""
help() {
@@ -33,6 +35,7 @@ help() {
echo " --build_type= Build with Release, Debug, RelWithDebInfo, UndefinedSanitizer or AddressSanitizer, default is ${build_type}"
echo " --devtools Build Devtools libs"
echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
+ echo " --cmake-args= Additional arguments passed to cmake configure"
echo " --toolchain= Toolchain can be specified (arm-none-eabi-gcc, arm-zephyr-eabi-gcc, aarch64-linux-musl-gcc). Default: ${toolchain}"
echo " --target_cpu= Override the toolchain's default TARGET_CPU (e.g. cortex-m4). Switching target_cpu reuses the same cmake-out dir, so clear ${et_build_root}/cmake-out first to avoid stale per-CPU artifacts. Default: unset (toolchain default)."
exit 0
@@ -45,6 +48,10 @@ for arg in "$@"; do
--build_type=*) build_type="${arg#*=}";;
--devtools) build_devtools=ON ;;
--etdump) build_with_etdump=ON ;;
+ --cmake-args=*)
+ # shellcheck disable=SC2206
+ extra_cmake_args=(${arg#*=})
+ ;;
--toolchain=*) toolchain="${arg#*=}";;
--target_cpu=*) target_cpu="${arg#*=}";;
*)
@@ -89,6 +96,7 @@ cmake_args=(
-DEXECUTORCH_BUILD_DEVTOOLS=${build_devtools}
-DEXECUTORCH_BUILD_ARM_ETDUMP=${build_with_etdump}
-DEXECUTORCH_BAREMETAL_SKIP_INSTALL=OFF
+ "${extra_cmake_args[@]}"
)
if [[ -n "${target_cpu}" ]]; then
diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt
index 876c65982e6..627406c1935 100644
--- a/backends/cortex_m/CMakeLists.txt
+++ b/backends/cortex_m/CMakeLists.txt
@@ -30,6 +30,10 @@ set(CMSIS_NN_LOCAL_PATH
""
CACHE PATH "Path to existing local CMSIS-NN installation"
)
+option(CORTEX_M_ENABLE_RUNTIME_CHECKS
+ "Enable additional Cortex-M runtime assertions and validation checks"
+ OFF
+)
# Try to find existing / local CMSIS-NN installation. This is useful for
# debugging and testing with local changes. This is not common, as the CMSIS-NN
@@ -107,6 +111,11 @@ target_link_libraries(
PRIVATE executorch
PRIVATE kernels_util_all_deps
)
+target_compile_definitions(
+ cortex_m_kernels
+ PRIVATE
+ $<$:CORTEX_M_ENABLE_RUNTIME_CHECKS>
+)
# Include directories for cortex_m_kernels
target_include_directories(
diff --git a/backends/cortex_m/ops/op_quantized_batch_matmul.cpp b/backends/cortex_m/ops/op_quantized_batch_matmul.cpp
index e6bc5a949ce..345753ca8fc 100644
--- a/backends/cortex_m/ops/op_quantized_batch_matmul.cpp
+++ b/backends/cortex_m/ops/op_quantized_batch_matmul.cpp
@@ -1,6 +1,7 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
@@ -71,6 +72,7 @@ Tensor& quantized_batch_matmul_out(
int64_t output_offset,
int64_t output_multiplier,
int64_t output_shift,
+ const Tensor& scratch,
Tensor& out) {
if (!validate_batch_matmul_arguments(context, lhs, rhs_transposed, out)) {
return out;
@@ -100,25 +102,26 @@ Tensor& quantized_batch_matmul_out(
quant_params.multiplier = static_cast(output_multiplier);
quant_params.shift = static_cast(output_shift);
- const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&out_dims);
-
cmsis_nn_context ctx;
ctx.buf = nullptr;
- ctx.size = 0;
-
- if (buf_size > 0) {
- auto buffer_or_error = context.allocate_temp(buf_size);
- if (!buffer_or_error.ok()) {
- ET_LOG(
- Error,
- "quantized_batch_matmul: failed to allocate scratch buffer (%d bytes)",
- buf_size);
- context.fail(buffer_or_error.error());
- return out;
- }
- ctx.buf = buffer_or_error.get();
- ctx.size = buf_size;
+ ctx.size = scratch.nbytes();
+ if (ctx.size > 0) {
+ ctx.buf = scratch.mutable_data_ptr();
+ }
+
+#ifdef CORTEX_M_ENABLE_RUNTIME_CHECKS
+ const int32_t runtime_buffer_bytes =
+ arm_fully_connected_s8_get_buffer_size(&out_dims);
+ if (ctx.size != static_cast(runtime_buffer_bytes)) {
+ ET_LOG(
+ Error,
+ "quantized_batch_matmul: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+ static_cast(ctx.size),
+ runtime_buffer_bytes);
+ context.fail(Error::Internal);
+ return out;
}
+#endif
const arm_cmsis_nn_status status = arm_batch_matmul_s8(
&ctx,
diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp
index 7d4433690f6..8af374c03f8 100644
--- a/backends/cortex_m/ops/op_quantized_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp
@@ -112,6 +112,7 @@ Tensor& quantized_conv2d_out(
const Tensor& requantize_shifts,
const int64_t activation_min,
const int64_t activation_max,
+ const Tensor& scratch,
Tensor& out) {
if (!validate_conv2d_arguments(
context,
@@ -182,31 +183,30 @@ Tensor& quantized_conv2d_out(
cmsis_nn_context cmsis_context;
cmsis_context.buf = nullptr;
- cmsis_context.size = 0;
+ cmsis_context.size = scratch.nbytes();
+ if (cmsis_context.size > 0) {
+ cmsis_context.buf = scratch.mutable_data_ptr();
+ }
- const int32_t buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size(
+#ifdef CORTEX_M_ENABLE_RUNTIME_CHECKS
+ const int32_t runtime_buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size(
&conv_params, &input_dims, &filter_dims, &output_dims);
- if (buffer_bytes < 0) {
+ if (runtime_buffer_bytes < 0) {
ET_LOG(
Error, "quantized_conv2d_out: CMSIS-NN buffer size calculation failed");
context.fail(Error::Internal);
return out;
}
- if (buffer_bytes > 0) {
- auto buffer_or_error =
- context.allocate_temp(buffer_bytes, kCortexMMveAlignment);
- if (!buffer_or_error.ok()) {
- ET_LOG(
- Error,
- "quantized_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
- static_cast(buffer_bytes),
- static_cast(buffer_or_error.error()));
- context.fail(buffer_or_error.error());
- return out;
- }
- cmsis_context.buf = buffer_or_error.get();
- cmsis_context.size = buffer_bytes;
+ if (scratch.nbytes() != static_cast(runtime_buffer_bytes)) {
+ ET_LOG(
+ Error,
+ "quantized_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+ static_cast(scratch.nbytes()),
+ static_cast(runtime_buffer_bytes));
+ context.fail(Error::Internal);
+ return out;
}
+#endif
const arm_cmsis_nn_status status = arm_convolve_wrapper_s8(
&cmsis_context,
diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
index 8dec61e0af1..21d4f257501 100644
--- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
@@ -150,6 +150,7 @@ Tensor& quantized_depthwise_conv2d_out(
const Tensor& requantize_shifts,
const int64_t activation_min,
const int64_t activation_max,
+ const Tensor& scratch,
Tensor& out) {
if (!validate_depthwise_conv2d_arguments(
context,
@@ -220,32 +221,32 @@ Tensor& quantized_depthwise_conv2d_out(
cmsis_nn_context cmsis_context;
cmsis_context.buf = nullptr;
- cmsis_context.size = 0;
+ cmsis_context.size = scratch.nbytes();
+ if (cmsis_context.size > 0) {
+ cmsis_context.buf = scratch.mutable_data_ptr();
+ }
- const int32_t buffer_bytes = arm_depthwise_conv_wrapper_s8_get_buffer_size(
- &dw_conv_params, &input_dims, &filter_dims, &output_dims);
- if (buffer_bytes < 0) {
+#ifdef CORTEX_M_ENABLE_RUNTIME_CHECKS
+ const int32_t runtime_buffer_bytes =
+ arm_depthwise_conv_wrapper_s8_get_buffer_size(
+ &dw_conv_params, &input_dims, &filter_dims, &output_dims);
+ if (runtime_buffer_bytes < 0) {
ET_LOG(
Error,
"quantized_depthwise_conv2d_out: CMSIS-NN buffer size calculation failed");
context.fail(Error::Internal);
return out;
}
-
- auto buffer_or_error = context.allocate_temp(
- static_cast(buffer_bytes), kCortexMMveAlignment);
- if (!buffer_or_error.ok()) {
+ if (scratch.nbytes() != static_cast(runtime_buffer_bytes)) {
ET_LOG(
Error,
- "quantized_depthwise_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
- static_cast(buffer_bytes),
- static_cast(buffer_or_error.error()));
- context.fail(buffer_or_error.error());
+ "quantized_depthwise_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+ static_cast(scratch.nbytes()),
+ static_cast(runtime_buffer_bytes));
+ context.fail(Error::Internal);
return out;
}
- cmsis_context.buf = buffer_or_error.get();
- cmsis_context.size = buffer_bytes;
-
+#endif
const arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s8(
&cmsis_context,
&dw_conv_params,
diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
index e3f6135c7b9..d2b66b18802 100644
--- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
@@ -1,6 +1,7 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
@@ -97,6 +98,8 @@ Tensor& quantized_transpose_conv2d_out(
const Tensor& requantize_shifts,
const int64_t activation_min,
const int64_t activation_max,
+ const Tensor& scratch,
+ const Tensor& output_scratch,
Tensor& out) {
if (!validate_transpose_conv2d_arguments(
context,
@@ -179,44 +182,43 @@ Tensor& quantized_transpose_conv2d_out(
cmsis_nn_context cmsis_context;
cmsis_context.buf = nullptr;
- cmsis_context.size = 0;
+ cmsis_context.size = scratch.nbytes();
+ if (cmsis_context.size > 0) {
+ cmsis_context.buf = scratch.mutable_data_ptr();
+ }
cmsis_nn_context output_context;
output_context.buf = nullptr;
- output_context.size = 0;
-
+ output_context.size = output_scratch.nbytes();
+ if (output_context.size > 0) {
+ output_context.buf = output_scratch.mutable_data_ptr();
+ }
+#ifdef CORTEX_M_ENABLE_RUNTIME_CHECKS
const int32_t buffer_bytes = arm_transpose_conv_s8_get_buffer_size(
&transpose_conv_params, &input_dims, &filter_dims, &output_dims);
- auto buffer_or_error = context.allocate_temp(
- static_cast(buffer_bytes), kCortexMMveAlignment);
- if (!buffer_or_error.ok()) {
+ if (scratch.nbytes() != static_cast(buffer_bytes)) {
ET_LOG(
Error,
- "quantized_transpose_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
- buffer_bytes,
- static_cast(buffer_or_error.error()));
- context.fail(buffer_or_error.error());
+ "quantized_transpose_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+ static_cast(scratch.nbytes()),
+ buffer_bytes);
+ context.fail(Error::Internal);
return out;
}
- cmsis_context.buf = buffer_or_error.get();
- cmsis_context.size = buffer_bytes;
const int32_t output_buffer_bytes =
arm_transpose_conv_s8_get_reverse_conv_buffer_size(
&transpose_conv_params, &input_dims, &filter_dims);
- auto output_buffer_or_error = context.allocate_temp(
- static_cast(output_buffer_bytes), kCortexMMveAlignment);
- if (!output_buffer_or_error.ok()) {
+ if (output_scratch.nbytes() != static_cast(output_buffer_bytes)) {
ET_LOG(
Error,
- "quantized_transpose_conv2d_out: failed to allocate output scratch buffer (%d bytes, error %d)",
- output_buffer_bytes,
- static_cast(output_buffer_or_error.error()));
- context.fail(output_buffer_or_error.error());
+ "quantized_transpose_conv2d_out: output scratch buffer size incorrect - actual: (%d) needed: (%d)",
+ static_cast(output_scratch.nbytes()),
+ output_buffer_bytes);
+ context.fail(Error::Internal);
return out;
}
- output_context.buf = output_buffer_or_error.get();
- output_context.size = output_buffer_bytes;
+#endif
const arm_cmsis_nn_status status = arm_transpose_conv_wrapper_s8(
&cmsis_context,
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index 2c35ed8730b..d4393bc7ada 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -271,13 +271,15 @@ def quantized_mul_impl(
"quantized_batch_matmul("
"Tensor lhs, int lhs_zero_point, "
"Tensor rhs_transposed, int rhs_zero_point, "
- "int output_zero_point, int output_multiplier, int output_shift) -> Tensor"
+ "int output_zero_point, int output_multiplier, int output_shift, "
+ "Tensor scratch) -> Tensor"
)
lib.define(
"quantized_batch_matmul.out("
"Tensor lhs, int lhs_zero_point, "
"Tensor rhs_transposed, int rhs_zero_point, "
"int output_zero_point, int output_multiplier, int output_shift, "
+ "Tensor scratch, "
"*, Tensor(a!) out) -> Tensor(a!)"
)
@@ -291,6 +293,7 @@ def quantized_batch_matmul_meta(
output_zero_point: int,
output_multiplier: int,
output_shift: int,
+ scratch: torch.Tensor,
) -> torch.Tensor:
batch, lhs_rows, inner = lhs.shape
batch_rhs, rhs_cols, inner_rhs = rhs_transposed.shape
@@ -307,6 +310,7 @@ def quantized_batch_matmul_impl(
output_zero_point: int,
output_multiplier: int,
output_shift: int,
+ scratch: torch.Tensor,
) -> torch.Tensor:
# Offsets are negated zero points (CMSIS-NN convention)
lhs_fp = lhs.to(torch.float32) + float(lhs_zero_point)
@@ -638,7 +642,8 @@ def pad_impl(
"Tensor requantize_multipliers, "
"Tensor requantize_shifts, "
"int activation_min, "
- "int activation_max"
+ "int activation_max, "
+ "Tensor scratch"
") -> Tensor"
)
@@ -657,6 +662,7 @@ def pad_impl(
"Tensor requantize_shifts, "
"int activation_min, "
"int activation_max, "
+ "Tensor scratch, "
"*, Tensor(a!) out"
") -> Tensor(a!)"
)
@@ -733,6 +739,7 @@ def quantized_conv2d_meta(
requantize_shifts: torch.Tensor,
activation_min: int,
activation_max: int,
+ scratch: torch.Tensor,
) -> torch.Tensor:
stride_vals = list(stride)
padding_vals = list(padding)
@@ -762,6 +769,7 @@ def quantized_conv2d_impl(
requantize_shifts: torch.Tensor,
activation_min: int,
activation_max: int,
+ scratch: torch.Tensor,
) -> torch.Tensor:
if input.dim() != 4 or weight.dim() != 4:
raise RuntimeError("quantized_conv2d expects 4D input and weight tensors")
@@ -830,7 +838,8 @@ def quantized_conv2d_impl(
"Tensor requantize_multipliers, "
"Tensor requantize_shifts, "
"int activation_min, "
- "int activation_max"
+ "int activation_max, "
+ "Tensor scratch"
") -> Tensor"
)
@@ -850,6 +859,7 @@ def quantized_conv2d_impl(
"Tensor requantize_shifts, "
"int activation_min, "
"int activation_max, "
+ "Tensor scratch, "
"*, Tensor(a!) out"
") -> Tensor(a!)"
)
@@ -870,6 +880,7 @@ def quantized_depthwise_conv2d_meta(
requantize_shifts: torch.Tensor,
activation_min: int,
activation_max: int,
+ scratch: torch.Tensor,
) -> torch.Tensor:
stride_vals = list(stride)
padding_vals = list(padding)
@@ -900,6 +911,7 @@ def quantized_depthwise_conv2d_impl(
requantize_shifts: torch.Tensor,
activation_min: int,
activation_max: int,
+ scratch: torch.Tensor,
) -> torch.Tensor:
if input.dim() != 4 or weight.dim() != 4:
raise RuntimeError(
@@ -973,7 +985,9 @@ def quantized_depthwise_conv2d_impl(
"Tensor requantize_multipliers, "
"Tensor requantize_shifts, "
"int activation_min, "
- "int activation_max"
+ "int activation_max, "
+ "Tensor scratch, "
+ "Tensor output_scratch"
") -> Tensor"
)
@@ -992,6 +1006,8 @@ def quantized_depthwise_conv2d_impl(
"Tensor requantize_shifts, "
"int activation_min, "
"int activation_max, "
+ "Tensor scratch, "
+ "Tensor output_scratch, "
"*, Tensor(a!) out) -> Tensor(a!)"
)
@@ -1057,6 +1073,8 @@ def quantized_transpose_conv2d_meta(
requantize_shifts: torch.Tensor,
activation_min: int,
activation_max: int,
+ scratch: torch.Tensor,
+ output_scratch: torch.Tensor,
) -> torch.Tensor:
stride_vals = list(stride)
padding_vals = list(padding)
@@ -1095,6 +1113,8 @@ def quantized_transpose_conv2d_impl(
requantize_shifts: torch.Tensor,
activation_min: int,
activation_max: int,
+ scratch: torch.Tensor,
+ output_scratch: torch.Tensor,
) -> torch.Tensor:
"""
Reference implementation of quantized transposed convolution.
diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml
index e0ebbfab868..8db109dea43 100644
--- a/backends/cortex_m/ops/operators.yaml
+++ b/backends/cortex_m/ops/operators.yaml
@@ -65,19 +65,20 @@
- arg_meta: null
kernel_name: cortex_m::pad_out
-- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
+- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
kernel_name: cortex_m::quantized_conv2d_out
-- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
kernel_name: cortex_m::quantized_depthwise_conv2d_out
-- func: cortex_m::quantized_transpose_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
+- func: cortex_m::quantized_transpose_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, Tensor output_scratch, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
@@ -94,7 +95,7 @@
- arg_meta: null
kernel_name: cortex_m::quantized_max_pool2d_out
-- func: cortex_m::quantized_batch_matmul.out(Tensor lhs, int lhs_zero_point, Tensor rhs_transposed, int rhs_zero_point, int output_zero_point, int output_multiplier, int output_shift, *, Tensor(a!) out) -> Tensor(a!)
+- func: cortex_m::quantized_batch_matmul.out(Tensor lhs, int lhs_zero_point, Tensor rhs_transposed, int rhs_zero_point, int output_zero_point, int output_multiplier, int output_shift, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
diff --git a/backends/cortex_m/passes/__init__.py b/backends/cortex_m/passes/__init__.py
index 92179ec6654..c379461949f 100644
--- a/backends/cortex_m/passes/__init__.py
+++ b/backends/cortex_m/passes/__init__.py
@@ -33,6 +33,7 @@ def _ensure_cortex_m_dependencies() -> None:
_ensure_cortex_m_dependencies()
+from .cortex_m_pass import CortexMPass # noqa # usort: skip
from .activation_fusion_pass import ActivationFusionPass # noqa
from .clamp_hardswish_pass import ClampHardswishPass # noqa
from .convert_to_cortex_m_pass import ConvertToCortexMPass # noqa
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
index 418f6cd63ff..e61ddaf63bc 100644
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -6,25 +6,32 @@
# LICENSE file in the root directory of this source tree.
import executorch.backends.cortex_m.ops.operators # noqa
+import executorch.exir as exir
import torch
import torch.fx
from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
+
+from executorch.backends.cortex_m.passes import CortexMPass
from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
+from executorch.backends.cortex_m.passes.scratch_buffer_sizes import (
+ required_cmsis_nn_buffer_sizes,
+)
from executorch.backends.transforms.utils import (
create_constant_placeholder,
get_param_tensor,
is_param_node,
)
-
-from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.passes import make_alloc_node
+from torch._subclasses.fake_tensor import FakeTensorMode
+
from torch.export.graph_signature import InputKind
from torch.fx.passes.infra.pass_manager import PassResult
-class ConvertToCortexMPass(XNNPACKPass):
+class ConvertToCortexMPass(CortexMPass):
"""
Cortex-M backend pass for replacing supported quantized kernels with Cortex-M
accelerated kernels.
@@ -33,6 +40,15 @@ class ConvertToCortexMPass(XNNPACKPass):
by call_operator.
"""
+ def _create_uninitialized_alloc_node(self):
+ """Create an unitialized alloc node to be initialize at a later point."""
+ with FakeTensorMode() as mode:
+ return make_alloc_node(
+ self.exported_program.graph_module,
+ mode.from_tensor(torch.empty(0)),
+ None,
+ )
+
def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset):
"""
Computes the precomputed kernel sum term (bias optional)
@@ -238,6 +254,9 @@ def _get_convolution_replacement(self, node):
torch.tensor(quantized_shifts, dtype=torch.int32),
)
+ with node.graph.inserting_before(node):
+ scratch = self._create_uninitialized_alloc_node()
+
if use_depthwise_conv:
# Compute depth_multiplier for depthwise convolution
# For depthwise: output_channels = input_channels * depth_multiplier
@@ -263,6 +282,7 @@ def _get_convolution_replacement(self, node):
quantized_shift_tensor,
output_qmin,
output_qmax,
+ scratch,
)
return exir_ops.edge.cortex_m.quantized_depthwise_conv2d.default, new_args
else:
@@ -280,9 +300,36 @@ def _get_convolution_replacement(self, node):
quantized_shift_tensor,
output_qmin,
output_qmax,
+ scratch,
)
return exir_ops.edge.cortex_m.quantized_conv2d.default, new_args
+ def _initialize_alloc_node_size(self, node: torch.fx.Node) -> None:
+ """For nodes with a registered buffer size function for node.target, set the buffer sizes
+ of the last n args, which should be exir.memory.alloc nodes. For nodes without a
+ registered function, do nothing.
+ """
+
+ scratch_buffer_sizes = required_cmsis_nn_buffer_sizes(
+ node, self.target_config.backend
+ )
+ if scratch_buffer_sizes is None:
+ return
+
+ # Assume that scratch_buffer_sizes are given from left to right in the call signature of node.target.
+ for i, scratch_buffer_size in enumerate(reversed(scratch_buffer_sizes)):
+ scratch_arg = node.args[-(i + 1)]
+ if (
+ not isinstance(scratch_arg, torch.fx.Node)
+ or scratch_arg.target != exir.memory.alloc
+ ):
+ raise RuntimeError(
+ f"Expected scratch alloc node as final argument(s) for {node.target}, got {scratch_arg}."
+ )
+
+ # buffer size is given in bytes, always use uint8 as dtype.
+ scratch_arg.args = (((scratch_buffer_size,), torch.uint8),)
+
def _get_transpose_conv2d_replacement(self, node):
"""
Transform aten.convolution with transposed=True to cortex_m.quantized_transpose_conv2d
@@ -363,6 +410,10 @@ def _get_transpose_conv2d_replacement(self, node):
torch.tensor(quantized_shifts, dtype=torch.int32),
)
+ with node.graph.inserting_before(node):
+ scratch = self._create_uninitialized_alloc_node()
+ output_scratch = self._create_uninitialized_alloc_node()
+
new_args = (
x,
weight_nhwc,
@@ -377,6 +428,8 @@ def _get_transpose_conv2d_replacement(self, node):
quantized_shift_tensor,
output_qmin,
output_qmax,
+ scratch,
+ output_scratch,
)
return exir_ops.edge.cortex_m.quantized_transpose_conv2d.default, new_args
@@ -415,6 +468,9 @@ def _get_bmm_replacement(self, node):
args=(rhs_node, [0, 2, 1]),
)
+ with node.graph.inserting_before(node):
+ scratch = self._create_uninitialized_alloc_node()
+
args = (
lhs_node,
-lhs_zp,
@@ -423,6 +479,7 @@ def _get_bmm_replacement(self, node):
output_zp,
output_mult,
output_shift,
+ scratch,
)
return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args
@@ -459,6 +516,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
args=args,
kwargs={},
)
+ self._initialize_alloc_node_size(cortex_m_op)
node.replace_all_uses_with(cortex_m_op)
graph_module.graph.erase_node(node)
diff --git a/backends/cortex_m/passes/scratch_buffer_sizes.py b/backends/cortex_m/passes/scratch_buffer_sizes.py
new file mode 100644
index 00000000000..36f3f8bbc17
--- /dev/null
+++ b/backends/cortex_m/passes/scratch_buffer_sizes.py
@@ -0,0 +1,266 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections.abc import Callable
+from typing import Any, cast
+
+import cmsis_nn # type: ignore[import-not-found, import-untyped]
+import executorch.backends.cortex_m.ops.operators # noqa
+
+import torch
+import torch.fx
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+BufferSizeFunction = Callable[[cmsis_nn.Backend, torch.fx.Node], list[int]]
+
+
+def _tensor_from_node(node: torch.fx.Node) -> torch.Tensor:
+ if "val" in node.meta:
+ return node.meta["val"]
+ elif node.op == "call_function":
+ args = (
+ _tensor_from_node(arg) if isinstance(arg, torch.fx.Node) else arg
+ for arg in node.args
+ )
+ return node.target(*args, **node.kwargs) # type: ignore[operator]
+ else:
+ raise RuntimeError("Encountered non-call_function without 'val' meta.")
+
+
+def _shape_from_node(node: torch.fx.Node) -> torch.Size:
+ return _tensor_from_node(node).shape
+
+
+def _get_common_conv_buffer_size_inputs(
+ conv_node: torch.fx.Node,
+ *,
+ stride_arg_idx: int = 3,
+ padding_arg_idx: int = 4,
+ dilation_arg_idx: int = 5,
+) -> tuple[
+ list[int],
+ list[int],
+ list[int],
+ list[int],
+ list[int],
+ list[int],
+]:
+ x = cast(torch.fx.Node, conv_node.args[0])
+ weight = cast(torch.fx.Node, conv_node.args[1])
+ stride = cast(list[int], conv_node.args[stride_arg_idx])
+ padding = cast(list[int], conv_node.args[padding_arg_idx])
+ dilation = cast(list[int], conv_node.args[dilation_arg_idx])
+
+ # Input is NCHW (PyTorch); CMSIS-NN wants NHWC dims.
+ n, c_in, height, width = _shape_from_node(x)
+
+ weight_shape = _shape_from_node(weight)
+
+ # Output is NCHW; convert to NHWC dims.
+ out_n, out_c, out_h, out_w = _shape_from_node(conv_node)
+
+ input_nhwc = [n, height, width, c_in]
+ output_nhwc = [out_n, out_h, out_w, out_c]
+ stride_hw = [int(stride[0]), int(stride[1])]
+ padding_hw = [int(padding[0]), int(padding[1])]
+ dilation_hw = [int(dilation[0]), int(dilation[1])]
+
+ return (
+ input_nhwc,
+ list(weight_shape),
+ output_nhwc,
+ stride_hw,
+ padding_hw,
+ dilation_hw,
+ )
+
+
+def cmsis_nn_conv_buffer_size(
+ backend: cmsis_nn.Backend,
+ conv_node: torch.fx.Node,
+) -> list[int]:
+ (
+ input_nhwc,
+ weight_shape,
+ output_nhwc,
+ stride_hw,
+ padding_hw,
+ dilation_hw,
+ ) = _get_common_conv_buffer_size_inputs(conv_node=conv_node)
+ input_offset = cast(int, conv_node.args[6])
+ output_offset = cast(int, conv_node.args[7])
+ output_qmin = cast(int, conv_node.args[10])
+ output_qmax = cast(int, conv_node.args[11])
+
+ # Weight is in OHWI layout after conversion.
+ c_out, kernel_h, kernel_w, c_in = weight_shape
+ filter_nhwc = [c_out, kernel_h, kernel_w, c_in]
+
+ return [
+ int(
+ cmsis_nn.convolve_wrapper_buffer_size(
+ backend,
+ cmsis_nn.DataType.A8W8,
+ input_nhwc=input_nhwc,
+ filter_nhwc=filter_nhwc,
+ output_nhwc=output_nhwc,
+ padding_hw=padding_hw,
+ stride_hw=stride_hw,
+ dilation_hw=dilation_hw,
+ input_offset=input_offset,
+ output_offset=output_offset,
+ activation_min=output_qmin,
+ activation_max=output_qmax,
+ )
+ )
+ ]
+
+
+def cmsis_nn_depthwise_conv_buffer_size(
+ backend: cmsis_nn.Backend,
+ conv_node: torch.fx.Node,
+) -> list[int]:
+ (
+ input_nhwc,
+ weight_shape,
+ output_nhwc,
+ stride_hw,
+ padding_hw,
+ dilation_hw,
+ ) = _get_common_conv_buffer_size_inputs(conv_node=conv_node)
+ depth_multiplier = cast(int, conv_node.args[6])
+ input_offset = cast(int, conv_node.args[7])
+ output_offset = cast(int, conv_node.args[8])
+ output_qmin = cast(int, conv_node.args[11])
+ output_qmax = cast(int, conv_node.args[12])
+
+ # Weight is in IHWO layout after conversion.
+ _, kernel_h, kernel_w, c_out = weight_shape
+ filter_nhwc = [c_out, kernel_h, kernel_w, 1]
+
+ return [
+ int(
+ cmsis_nn.depthwise_conv_wrapper_buffer_size(
+ backend,
+ cmsis_nn.DataType.A8W8,
+ input_nhwc=input_nhwc,
+ filter_nhwc=filter_nhwc,
+ output_nhwc=output_nhwc,
+ padding_hw=padding_hw,
+ stride_hw=stride_hw,
+ dilation_hw=dilation_hw,
+ ch_mult=depth_multiplier,
+ input_offset=input_offset,
+ output_offset=output_offset,
+ activation_min=output_qmin,
+ activation_max=output_qmax,
+ )
+ )
+ ]
+
+
+def cmsis_nn_batch_matmul_buffer_size(
+ backend: cmsis_nn.Backend,
+ matmul_node: torch.fx.Node,
+) -> list[int]:
+ rhs_transposed = cast(torch.fx.Node, matmul_node.args[2])
+ rhs_shape = _shape_from_node(rhs_transposed)
+
+ _, rhs_cols, inner = rhs_shape
+
+ return [
+ int(
+ cmsis_nn.fully_connected_buffer_size(
+ backend,
+ cmsis_nn.DataType.A8W8,
+ filter_nhwc=[inner, -1, -1, rhs_cols], # H and W values are unused.
+ )
+ )
+ ]
+
+
+def cmsis_nn_transpose_conv_buffer_size(
+ backend: cmsis_nn.Backend,
+ conv_node: torch.fx.Node,
+) -> list[int]:
+ (
+ input_nhwc,
+ weight_shape,
+ output_nhwc,
+ stride_hw,
+ padding_hw,
+ dilation_hw,
+ ) = _get_common_conv_buffer_size_inputs(
+ conv_node=conv_node,
+ stride_arg_idx=3,
+ padding_arg_idx=4,
+ dilation_arg_idx=6,
+ )
+ output_padding = cast(list[int], conv_node.args[5])
+ input_offset = cast(int, conv_node.args[7])
+ output_offset = cast(int, conv_node.args[8])
+ output_qmin = cast(int, conv_node.args[11])
+ output_qmax = cast(int, conv_node.args[12])
+ c_out, kernel_h, kernel_w, kernel_c_in = weight_shape
+ filter_nhwc = [c_out, kernel_h, kernel_w, kernel_c_in]
+ padding_offsets_hw = [int(output_padding[0]), int(output_padding[1])]
+
+ return [
+ int(
+ cmsis_nn.transpose_conv_buffer_size(
+ backend,
+ cmsis_nn.DataType.A8W8,
+ input_nhwc=input_nhwc,
+ filter_nhwc=filter_nhwc,
+ output_nhwc=output_nhwc,
+ padding_hw=padding_hw,
+ stride_hw=stride_hw,
+ dilation_hw=dilation_hw,
+ padding_offsets_hw=padding_offsets_hw,
+ input_offset=input_offset,
+ output_offset=output_offset,
+ activation_min=output_qmin,
+ activation_max=output_qmax,
+ )
+ ),
+ int(
+ cmsis_nn.transpose_conv_reverse_conv_buffer_size(
+ backend,
+ cmsis_nn.DataType.A8W8,
+ input_nhwc=input_nhwc,
+ filter_nhwc=filter_nhwc,
+ padding_hw=padding_hw,
+ stride_hw=stride_hw,
+ dilation_hw=dilation_hw,
+ padding_offsets_hw=padding_offsets_hw,
+ input_offset=input_offset,
+ output_offset=output_offset,
+ activation_min=output_qmin,
+ activation_max=output_qmax,
+ )
+ ),
+ ]
+
+
+_target_to_buffer_sizes_registry: dict[Any, BufferSizeFunction] = {
+ exir_ops.edge.cortex_m.quantized_conv2d.default: cmsis_nn_conv_buffer_size,
+ exir_ops.edge.cortex_m.quantized_depthwise_conv2d.default: cmsis_nn_depthwise_conv_buffer_size,
+ exir_ops.edge.cortex_m.quantized_batch_matmul.default: cmsis_nn_batch_matmul_buffer_size,
+ exir_ops.edge.cortex_m.quantized_transpose_conv2d.default: cmsis_nn_transpose_conv_buffer_size,
+}
+
+
+def required_cmsis_nn_buffer_sizes(
+ node: torch.fx.Node, backend: cmsis_nn.Backend
+) -> list[int] | None:
+ """Returns a sequence of scratch buffer sizes required by node, in bytes.
+ If no function is registered to compute this for the target of the node, return None.
+ """
+ if node.target not in _target_to_buffer_sizes_registry:
+ return None
+
+ buffer_size_function = _target_to_buffer_sizes_registry[node.target]
+ return buffer_size_function(backend, node)
diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh
index bdca1a21e7c..a67c5a907a4 100755
--- a/backends/cortex_m/test/build_test_runner.sh
+++ b/backends/cortex_m/test/build_test_runner.sh
@@ -28,7 +28,7 @@ fi
script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
et_root_dir=$(realpath "${script_dir}/../../..")
build_executorch="${et_root_dir}/backends/arm/scripts/build_executorch.sh"
-${build_executorch} --devtools --target_cpu="${target_cpu}"
+${build_executorch} --devtools --target_cpu="${target_cpu}" --cmake-args="-DCORTEX_M_ENABLE_RUNTIME_CHECKS=ON"
# Build executor runner with selected aten ops and semi hosting
build_dir="${et_root_dir}/arm_test"
@@ -48,4 +48,4 @@ aten::unsqueeze_copy.out,\
aten::select_copy.int_out,\
aten::amax.out"
-${build_executor_runner} --pte=semihosting --bundleio --target="${target}" --output="${build_root_test_dir}" --select_ops_list="${select_ops_list}" --extra_build_flags="-DET_ATOL=5.0 -DET_RTOL=1.0"
+${build_executor_runner} --pte=semihosting --bundleio --target="${target}" --output="${build_root_test_dir}" --select_ops_list="${select_ops_list}" --extra_build_flags="-DET_ATOL=5.0 -DET_RTOL=1.0 -DET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=0"
From 5fc929fa88e3b76c7ef26a482c896b344054ef48 Mon Sep 17 00:00:00 2001
From: qti-chenweng <168707118+chenweng-quic@users.noreply.github.com>
Date: Tue, 26 May 2026 16:55:09 +0800
Subject: [PATCH 13/91] Qualcomm AI Engine Direct - Refactor llama runner for
dynamic IO dtypes (#19146)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
### Summary
To enable GPU backend support in the Llama runner, refactoring is
required because the dtypes of kv_cache, attention_mask, and logits are
currently hardcoded, preventing floating‑point models from running.
This PR focuses on removing the hardcode dtype for them.
#### Key changes
- Remove template parameter from KVManager,
LhdTokenGenerator,
MultimodalPromptProcessor, and related runner classes
- Detect kv_cache and attention_mask dtypes dynamically from MethodMeta
at
construction time instead of compile-time bitwidth detection
- Switch to std::byte* pointer arithmetic with getDtypeSize() for all
buffer
offsets; add fill_mask() helper for multi-dtype attention mask filling
- Update spec_prop pass for custom llama op for sharding case greater
than 1
### Test plan
```
python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder /local/mnt/workspace/chenweng/executorch/executorch/build-android --device acfa9311 --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --use_fp16
```
cc @cccclai @cbilgin @abhinaykukkadapu
---
backends/qualcomm/_passes/build_quant_io.py | 48 +--
backends/qualcomm/tests/test_qnn_delegate.py | 18 +-
backends/qualcomm/tests/utils.py | 1 +
.../stories260k_hybrid_llama_qnn.pte | Bin 1355520 -> 1350272 bytes
.../llama/decoder_runtime_evaluator.py | 2 +-
.../oss_scripts/llama/decoder_utils.py | 6 +-
examples/qualcomm/oss_scripts/llama/llama.py | 70 +++-
.../oss_scripts/llama/qnn_llama_runner.cpp | 25 +-
.../llama/qnn_multimodal_runner.cpp | 38 +-
.../oss_scripts/llama/runner/decoder_runner.h | 28 +-
.../oss_scripts/llama/runner/kv_manager.cpp | 366 +++++++++++-------
.../oss_scripts/llama/runner/kv_manager.h | 43 +-
.../llama/runner/lhd_token_generator.cpp | 29 +-
.../llama/runner/lhd_token_generator.h | 18 +-
.../multimodal_lhd_token_generator.cpp | 26 +-
.../multimodal_lhd_token_generator.h | 18 +-
.../multimodal_prompt_processor.cpp | 53 ++-
.../multimodal_prompt_processor.h | 51 ++-
.../multimodal_runner/multimodal_runner.cpp | 73 ++--
.../multimodal_runner/multimodal_runner.h | 12 +-
.../multimodal_token_generator.cpp | 50 +--
.../multimodal_token_generator.h | 43 +-
.../llama/runner/prompt_processor.cpp | 84 ++--
.../llama/runner/prompt_processor.h | 30 +-
.../oss_scripts/llama/runner/runner.cpp | 71 ++--
.../oss_scripts/llama/runner/runner.h | 13 +-
.../llama/runner/token_generator.cpp | 80 ++--
.../llama/runner/token_generator.h | 30 +-
.../qualcomm/oss_scripts/llama/runner/utils.h | 41 ++
.../llama/wrappers/attention_sink_wrappers.py | 2 +
.../llama/wrappers/llm_wrappers.py | 46 ++-
exir/passes/spec_prop_pass.py | 15 +-
extension/android/jni/jni_layer_llama.cpp | 43 +-
extension/llm/custom_ops/model_sharding.py | 24 +-
extension/llm/custom_ops/op_fallback.py | 29 ++
35 files changed, 820 insertions(+), 706 deletions(-)
create mode 100644 extension/llm/custom_ops/op_fallback.py
diff --git a/backends/qualcomm/_passes/build_quant_io.py b/backends/qualcomm/_passes/build_quant_io.py
index d43842e84a5..057dcc0f864 100644
--- a/backends/qualcomm/_passes/build_quant_io.py
+++ b/backends/qualcomm/_passes/build_quant_io.py
@@ -5,11 +5,10 @@
# LICENSE file in the root directory of this source tree.
import torch
from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO
-from executorch.exir.delegate import executorch_call_delegate
-from executorch.exir.pass_base import ExportPass, ProxyValue
+from executorch.exir.delegate import executorch_call_delegate
+from executorch.exir.pass_base import ExportPass, PassResult
from executorch.exir.tensor import TensorSpec
-from torch.utils import _pytree as pytree
class BuildQuantIo(ExportPass):
@@ -28,22 +27,27 @@ def _make_spec(self, x):
else:
return None
- def placeholder(self, name: str, arg, meta):
- if quantized_dtype := meta.data.get(QCOM_QUANTIZED_IO, None):
- arg = arg.to(dtype=quantized_dtype)
- meta["spec"] = self._make_spec(arg)
- return super().placeholder(name, arg, meta)
-
- def call_getitem(self, value, key: int, meta):
- meta["spec"] = value.node.meta["spec"][key]
- return super().call_getitem(value, key, meta)
-
- def call_delegate(self, lowered_module, args, kwargs, meta):
- args_data, _ = pytree.tree_map_only(
- ProxyValue, lambda x: x.data, (args, kwargs)
- )
- meta["spec"] = pytree.tree_map(
- self._make_spec,
- executorch_call_delegate(lowered_module, *args_data),
- )
- return super().call_delegate(lowered_module, args, kwargs, meta)
+ def _build(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+ # Forcedly update delegate node's meta['spec'] to get correct output
+ # tensor size in runtime
+ call_delegates = [
+ node
+ for node in graph_module.graph.nodes
+ if node.op == "call_function" and node.target == executorch_call_delegate
+ ]
+ for n in graph_module.graph.nodes:
+ if QCOM_QUANTIZED_IO in n.meta:
+ n.meta["val"] = n.meta["val"].to(dtype=n.meta[QCOM_QUANTIZED_IO])
+ n.meta["spec"] = self._make_spec(n.meta["val"])
+
+ for call_delegate in call_delegates:
+ spec = []
+ for user in list(call_delegate.users):
+ spec.append(self._make_spec(user.meta["val"]))
+ call_delegate.meta["spec"] = tuple(spec)
+
+ def call(self, graph_module: torch.fx.GraphModule):
+ self._build(graph_module)
+ graph_module.graph.eliminate_dead_code()
+ graph_module.recompile()
+ return PassResult(graph_module, True)
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 6d5b44d7a35..ee6678fa499 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -7730,8 +7730,11 @@ def test_llama_stories_110m(self):
"--max_context_len",
"128",
]
+ if self.use_fp16:
+ cmds.append("--use_fp16")
self.add_default_cmds(cmds)
-
+ print(" ".join(cmds))
+ exit(0)
golden_start_with = "Once upon a time,"
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
@@ -7750,7 +7753,10 @@ def test_llama_stories_110m(self):
# x86 does not allow weight sharing, so we don't check pte size
if not self.enable_x86_64:
pte_size = msg["pte_size"]
- self.assertLessEqual(pte_size, 135_000_000) # 135MB
+ if self.use_fp16:
+ self.assertLessEqual(pte_size, 275_000_000) # 275MB
+ else:
+ self.assertLessEqual(pte_size, 135_000_000) # 135MB
if not self.compile_only and not self.enable_x86_64:
self.assertGreaterEqual(msg["inference_speed"], 220) # Lanai
@@ -10087,6 +10093,13 @@ def setup_environment():
choices=["wikitext_ppl", "hellaswag_acc_norm", "sqnr"],
type=str,
)
+ parser.add_argument(
+ "-F",
+ "--use_fp16",
+ help="If specified, will run in fp16 precision and discard ptq setting",
+ action="store_true",
+ default=False,
+ )
args, ns_args = parser.parse_known_args(namespace=unittest)
TestQNN.host = args.host
@@ -10114,6 +10127,7 @@ def setup_environment():
TestQNN.backend = args.backend
TestQNN.static_llm_eval_method = args.static_llm_eval_method
TestQNN.direct_build_folder = args.direct_build_folder
+ TestQNN.use_fp16 = args.use_fp16
return sys.argv[:1] + ns_args
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index d8802f74e68..c22ee8371e0 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -221,6 +221,7 @@ class TestQNN(unittest.TestCase):
static_llm_eval_method = ""
direct_build_folder: str = ""
dsp_heap_profile_filename = "htp_heap_usage.txt"
+ use_fp16 = False
@classmethod
def setUpClass(cls):
diff --git a/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte b/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte
index ad6bee06146c78f8fe1df1c77c610d72dcda8c13..5903c5b5c32277c0eaa795ae65c54370451900e8 100644
GIT binary patch
delta 306914
zcmcG%dt6j?{>Og~!;Gk-q5`4}7X|OAsAyEESX5Scjmipr5ShLJJcFfxZ3M%GXw
z5N1ZQVdM@mjDo?2QJ7>HixUl_D8Vq66QI~<7%SrVbC6-|9YEG-s_bVNjeUsI%P=Z>
z7{<|VhLPhoj4fftSPC?ZeXj+BQeK+lngcG>S1W(!g&Bsa2A}O25NH}7(d*7vgF!E1
z>k4dlo!V>L%fVm-5`H|F2Uf=nxTK*=61MJxU}r;H9P+%w!C;$%zkr?J3PP{dbw&)|
zha7QIT+H959oF`F=5sfN-(rqj`o{lMepN*&0u$$F&
zG-yzF8q~Y~e`=77$D42dTZ?)f-P6^g#{+}o23!)wnCTG;+xJOV{izOl<+s6Lri1^4
zonMOQC$~!;-n-GNKP&3`H>Zuza?1ZpFUsG#?|&@6yZ5bstG;yo>z%cFPCxpe$D_8(
zCT!`!uJ+VBoCX!Vf9pNdBHrp6j|byukt`yjS3_Jdx8&uI{
z6E=&5S}KyC8i(A6?@kB59y`BfWf%=z^6+)`Lf`b;qBo~)dAqAYCneD!Sxx?j2FVb8
z{o%j2D3U=74bf}A{^u5@ciDs;Vd)LED90h+iEo~R$Na0SMa5n6@Uh5gXx-WO-ZpK<
zyIn2%IE5C;`tm=v=+2M-y+zv^x>|I3`+sgxLzhk1Iu`Cwi&`A=Jbc?6`~~bnV-&^A
z(7hJE4>>LJ_xkPiX@?uST6EJ$S|lrsv?$ea<$w$H)yAKN@{kOSpB>k0*r&g5%(&7r
z^ozOV!72~ha4zo=+sg2Gti6Iq%m47?VX#AYQ14@1jl|qouO|X9ES#tkx7uN!PY%Cf
zu!rDhXO|-UdK8GV%!O;FNf1fEnlWnZE#}Y|)|3C#SSenv`v)n;O^eTSS4eP?|C
zz_ye7rZAsF^)CH^{5N2)e`>TYJ8;ZAJFq^@H#Fw+|DLP}&z*L%rvH#EiFu=wg|DM?
zuUlf$V8pv^+a7ef*_Ed$XYYw-$9Jd5yMv31VFVX;GaSIk=t)9ct3;-*;l8O3Ur$
zlud{o>S_MJX4*}|f1jzq(>IC!5~1OeInm9KIl+&4KTljKK$>$-pnPKF(2oC)L|udQ
z0*(KDBAPUGCv!B^r1O5Jq_`IrTspkG_Qu48OnJ2FZSvYN?E3LcE;tC
zqlSL@)ql@cgsVLXH&NH`v-#uuCNVQZ$@1w-Q)gPC<#wXN6C#Ixz_R<_n$?wP9VPuf
zkxax2re`RVbYv*gVlB5bk{hvL$;A4=sA%ZVIB^h;nu>DC0zexGjF8=opFkH=q8>3
z|F@nzkJ|o_ac~|JG}OMBj$kmfexzx6cHo^U7Y$wa-TyVq>H2YKW$IjcbC{f=EYgdi
z37@3p*@0K4UNm$S8+~a@sORKQXYZB3>#g29iF$tD7XP$O-&Y!_Iun
zUAzBYnRe00RXk}VUj|nJCUUiLji3|ctX#n%dq!^t!#*=_JE40L6eis0fu>tWrj
zET1KB$>Fb(O1ZIKE|Q0DG0nY{ZMHH>+1e+!>Hsc
zhzKss8-=VdABeg`Ts&!~>-)x8WutRJ6u5f65%(hTf
zwr`C?JIA55eVZKGvfpXTPm73-(h{j^#~-va9NLyYXiFSgnTw&S1_UJXHitI%587sj
zc8xxb=#JF4lwwd#tNSM(oy!3QNzQT5b`5S#*k6=
zi$~t{S!>yP{*+5hpk~hDQXgwZBbqH-|9`9*IXq;gT7KNwV8<}Yni1$f
zFFM-qut{m_ng)r{wGPdEhh{)v{k(|i!wzk^Lz@`w(@e6~IW)H49EY~mp-qe~b7?Y73DP2~w|r@J~g
zF?y#%Q}8>@QHQ4DcbXK!0)692h+J2`gcWBb|9>yDZ<=n*RI)|pvp-Ju=8tnTW
z(!}Tyno0J0r)F>-EDEt`y!P|gIu6T3v0F^OoJxm(1=ul-{%|A8To2`tsC#~I9~X$6
z7nSa}!h3noYd`qAN{BRgUEW!OGf}{Ier#`ftuJ-SW!>gmJ%OVOB31I)o)vxGdV0lq
z=biQOJ4ed4#~O2@{ZyLDW1Z0EU41Rp{g*86yZiF5GDUSjtCk1)UiwhqN4A|~Ih65$6^FM2
zyDoh?thTr1R#~_rtiKIsT(-F1(+TBPgf%>1#P+z?vTD6-ao@ex9@}fFFu;lWm
z!!mmY@^6?B$XQg_x9Gq$D~~MwV(?Jll|{PoI9XRL8+sxg8kFLDKMM;56
zue!}_3;gS1EcOfe`J%MsrpZ`@r^aDXNcf>
zHgbNh!_|SO?#?eQ7-1Ss!%ZWP>%cJ6$mQBU)HHIq_NJJ8He(uf$)+)fi;r3i-x$-_
zk%}GH(b1++&NYH-&nVL<;A$Ib8uxRha~&Xd30Ge_q;-gCG;q~&Rd5w^+Qk#z^*5UyNf}-z|~I;>yeR0x@zcv!1S7--6k3P!F^GshVN|;_6cl$YJ>o9
z;4K?O1di4CB6$9kAnk@aIgs_VTTDDS4G%20i6PN23xA_M_-mrt+H59xMu2}w4D34;
z*PV~5a$v_314o`tc7uLMR0&_0!#q{seAz_3IbJE(U)@j4vey_<(o~Q_H_J!#(@q@9LQ2jW$@lrR_0^rl5tI@!-pGbLoT!e7$8vC`
zt(a(3LLNvAOxfM78N(*eC6453aWq-A*uaAa@5o+v)R8Sh#p016_<@
zU8jH+RU%m1DVV@U9zl7hfQGdoxT{mZxJSL+9=x+tKnpVw{HaqgnT-H~H6rLGX=!LV
zf;(vTpOf97+Y?p(H|7veBlxz&z$&Es+k<6^YO6?7!M8FVw)B=n)gjUnq^lDH3C~%!
zeg?AzX+6@L8OrB|ON1K)+2fS*PJ8g$M76Jl9+?APW~*eJN}yK}sdl&UWQ&@gL0ltc4uO5Yhfp%*0v@}IZtF0>VNnc;h{
zJ$ON4U^fcTd|N?J870v3spf^z;h=K^bN>?GGs7CI2KYIhoM{W6+sP*wDGlwx*}}bI
zKhem6&*XM`VPZHaC(w7ts2*}_8#|z9SyX1Q(E^
zC!b)nz{hm*iAL0c_TcDFo^E8qM|Sc_Mj?E7C!cJT!-op@O1=|}TKEtq$BE`JcN2Ip
z(>vJ{@lks)AyLiBM4AqcN1D`~Zq9=YVwz`|gL}xNGu9v&Xld!gZIBqPl`b_J5ky;B
zjKe=}5BB3`_>rJ#;J$(JyOTZhLA~j|Q<0W|dm&A<=3ou12hq+W7t1O~W(3_ut*U-v
z^)o(UY1Vpq<6xvBaErdXl$Hw%C%4a=X7#L5g22@&pfS}5!a4;MT#vx)6cD5pfgyrk
z5`&rGYh)vt5J;#S;>iYfB&Z`lbH5gW+Yn+zk9RL9G+9
z_fsAh34z|P4DqA`k0+>uBF+QE!9{fH*?GyVP@G;wn78Jvd3XAJyPcWkr$#-26`e6@}BV7_Yq6NvfT@vC%
z9cmA@B&glLk|7iPO+sMm--mb#fnN(u?2cPG>|Zv+r51KLAux82Rc#~eE1Qurys16-
zWrEuJ8{tyHUx;RiCl}aE_r7Yml)(OJGZ`(|=Qfj}f_-K)87Ej%g4)zZINxXO!9${9
zh=2!eBi5uM*rzt*KCOT?G6=6(sdmCXu^FkFVIMPC?S%7w&hu9^3>NT!ZNyZ_gMDZ-
zmV`3c2R0*B4eWiJk*WdKkf2s|5UvgU9z9FAlz;N{7HIWd4(uJ9$*{rp+l*9|u(xeS
zsy(oM32JGOa4q1s5&}nFAL5B>=JB5pIJnnpP$ujjHlsm>us3Zc!vuT7W-?5$y$Sc_
zhwPQu*3a4xteL3Ic&13%7m4+GbNs}fYc;AZfH32L3-sIS_C
z6$xs$;B4?ZZau-p;57*_?pwio!1?sB;8yT_>KB~yulC?P>KB{~&Y^z6W#H4P-wmz>XHmc4X7EYWFF5M!
z_Fy{o3(f|QrGCN1;L+6Y37g=q0gnoNwr`Xw^@O?IP0$h0aqg&Z+JnPH5t#P2+dTu8
zBFtWbOQ1sn+ut6gW_QDD8#D~pz_#Jfv
zQ-5Hkjt`9eaFizx_$@U9%YiNNfpx&0z;CGc!%^-Q;Makx4-M;ig0#`s%8DCrJxzOB
zgE9lbXSC;|5#7>^6_8KFIPmZRw`U9NLv1U)U^F0jH(srZ3>)PR|B-dZjw`*I4&7_n
z4)YX%|89qzV62Di5<9i9Z_)kLKeY$9+R4Z5pN@8yfKIU$tj9(TG&ArFibl|6TOl3b{k1)q7U*?wv^x_t3dL|a
zQyN9El)w_?m7oMmE{(5;`2uep9PU028W=d%G`L%ak@TDFy9OUh^kjpg;?oI5p!iKmr!3O3-
zBPJV*!9V!|Q=3vECK(lwqmYC21{Ljr9`UKAv0=kJN5S9t)Pu1+6Jp!h0QdrfK1=Xq
z0>8xLvtgdaz%MN4!1~X^J@-Qo*@4rIJ&=!xG>~Po75t%3Z5l|_qz<00KK16nu(6(8
z@H;-$A#y+XEni^k=k{5t2Kt83VV(oPS8a`qS9p-8mQQURgi{)Lm#vcF&x5|?3*`TE
zm}d>}d0QissTTTYpURKJsR{h7FVMSLPPJa6BUmG3m?s1Hq%EJwH$^*wkNZ?_9}erm
zkJ^e!#!g6;FR&Yj!@!M}oKtO-*%7P^Y(M7no?!1-GZ8%0DUjzlf(JVV%&>9<4|EFT
zX^!CjP65-l5rOIy(8Tbt4(^^%7Dg-;!F?j=C247DE`kdBG+rL?;D9f%>PsH*z!T0zA`w+JR*E2FA_PIKOcl!bfU$OTBuJfrQA}#{2^#zt9t^ls_si}!{N*%b&
z7dY~jJzgzPztCZx*zk^EiLH_G%7(5WmN+d37uzZsuS)20UtrVWVV=FfB3mQl)e60e
zx{{bZDejKol@_*-x4G~uXrbuJz>Bp`lGVZ&G0ub0HG?nLI>{F0=?GrtQ@cf%4KB2G
z*14}3eo0{Pckw;btVbmfx*Ak5N90s3bEA>-4qJtxKdyD-%%z@2;u?T@?TXI>PYGAVh
z3%+Fy0?oAKvd($CcW}(!g*+2|n(V#hYZL
z!qYqXWFr?oLAdpD=met#KF+5S&NqjP>sY3Gl59lwLq_}5s`EK{wt+`k3VJui+Yua2
ze_nuM4tN-fWX}pvGEzB?%1UiiBVgY}Fs4mCB#|k4QRuLG96>_B?Oh_G-^Qm>9+
zoYq^1)f@zaL~osSi(vyS^)Po8I0j9!XD_I~c9Ib=S`qXUC+mdk>)pX2TJGdY_IiDUYJ8gqeYvKP5#l6nh|s5P3t<0tvU`&V3`_per}GT338k*Wsvl`vjC
z0Kbe=tI`P92L8e}V#1_EcLbYl#%#%f{nKWoDu#V-Gg4K;K8p)<5N;2!DNZdNPq-HF
zArV`B7sZ!m4%&(jEcxr(k$Eo}YglhzU6c;$yW;G~$
zKu7R^&1g_6>_eN;ce$_+Y$n45dtVq&I$%Sbnl+Jd_2BnxBO25Sd)H<($QRoY+#eTM
z)z0G%xGzo}5tkxxeOzE4;tJrKacXKh;+^1E>GuxK62O<4S3uvuj^Ing0OkOniwkT8
zmH?k253mOK1brCfECJliyaIX$u{DSbECpr)?~4m;0u}-9CUbCB5O?1?U>@-5xchdSquk}dtM2>O9PQo-x=h8Jz!u;_
zHB+F^$CF8|6qo}%M?E931UOTDDzFB4hUygtYy?hMQv`bBJAx;xB?2>n8ERvgIniAV
zJVBW*bDVn%aFV)MU?Xs%dPQJlLPv1C8XOMH298t5!c7jx;ilWY0zSs#)&gqOz(;HS
zNpg6Gj}*@7Ig#~Tz2e4owj0+O@SzsBT}$A}T7R;fpy7jsbA$#a+NwkkuCYlS!FbzM
z_5(%^JWlH~SYj(KQuGcZQra9N>@sKFh~4sRZ0?E!kqM#`u}aL6(r
z<*o;Rq{GT2XoJ5`xeGWCj~Kx^qQg!w=EL5!!%pjCx;<;)uj#O?T0E5yyn=Z))i#6w
zYKKjib7IuUj^Oiy)NavbgP+wp*>e=bpB$tTPD57>epKsN$&3d0#zAV8=)y;FWY9X+
zJR==`4?QlrLhxN$CwD*v{5G0#I=VgJQmvErx596v=S7z?x+8e4)=97C!k5tdqALS0
z(mH8iE&O8IcLus<@I_i@-8-r5dugBOvca>pPIhF)@H1)O40P4tQ$;u2`s|hiuwm{x@G!MH%1m}21`bi1V@&JgV((a4
zE#8VUhr2U?NtU}xisr^%3?HP9MVl$zC!`q_uqdLP6_()H0q#%Iv&^^%hOH6WPi>4Q
zc=$N$_KY@%c018XgSaHXa8DkveV|H+VZAN~|29y~Vvc%tfR7JUH^uNaxCz`kP~CYp
z8N$<;wgc7HvzfMO;O{NpVH8&uR!udmqhj!NFMK!$o>M!V_gz%kZL5=T@r4mdL)l_No2!;-?8(iqPFI%no9p+9&dK@Poue
z?#=_>C;iOhC!WDs1HZ@O*2Ak7y4F_v=P@iz@ITpVxv#w^upZdzqvD+bUu~;pxhRCL
zwAE?n;JqGxqwwMGoxl}pWdhI37T^s6?dPR0qa%2|`cyPI!0T*{WGjIdt0{?SYJk_;
z8cEa$y+&;mjrT-$H?~G@>P+Y|)t+c3dkTR|2CClY((H2ZRcdAu;#%Mp1696=8^Mdz
z#w2ztQ77>-K$(Miw$A_-s*44d054Im2;2rNP=kk<<2=p4d^-9(9xhQQcLWz$*natt
z4bNlHL{|*HNb96Q)$j|s7tcr60KP!$q)FkK9l`nB(V|NSpKt4|mk)*TbCsFG0#Xh<
zM_>}Sf>8^bYcpnLBW$+K$QC|Dc5PVty!J=)Mkm{R$B+}A_U`XHMLj3QY|%0Jb1MFY&ba&
z14n2tc?23!S+a*jXP?(Ip($D~V`D5vkZkEyS{_U6dU%pHu=WBw5hPj$*4e!onjlql
zemw1+E?dh^34NcA#MdREF?mShx+K(JhGbBegj6+1*n);4Q(Xg+*e(g}X+tugOG4kL
zoFZFP+83wz8WyK-c(lzU^JQ^|N7;Ov=qusOAs(mzbmJu$8^|F|(IJGb@cjH328^JwA!*eoQwxvLy^-TucQjv7;
zlE~ym6451*$%~|$C9ysZm&uF7!yQ*Z__)e4Ou*Go6hJ~q5
z$Jo^t!Ay%uwG}XfmG@#zOkltEcKqw(_B*-X=t0KGR)B5G?g)i
z{u-;sUP8V07Izm%2rb7RvDH(I2hhtTT#4G`S6{|LlCuTMH%UHE;g5A1$=ogyOx>jhj
zrZRkI@lrxl89(UfvFgZW#4G}TW~q2;f}6A%t=kDbq$#ayhJFgY953(Lyn0|nCd#uD
z{1Kxp@;vZ|v1*mb%fRnqvxw!o7F-{z@&z}8Un7p-sGN>q9UUk*8~hil5?l
z@GR;V+yI_I{er{i@YJCGtN7qM6P&I3o@5SF&t{n2-OJ&p*u2k)@M?IL&6f#30H12}
zw}pG>^5hUcN*yZV7@h?`N%%N-F)+hcEjx)5{1*6Ri`(zY8{z5dGjV-(39gazczZ0I
zR}R25TQ%)udF22fYjNAP2A*o`OPAu>1RtfgpNwnNIXpjXRi8|G)=WR0B`8xV6b@gboqP?j=`_7h-8WH!`6~LX%a~6c*AVc%scCr^>nvPq2BG
z@OpT>&6f*rgZpe=B|P<9d7@bH_6yI053=}%Y0z@$KwF(Ad?!5C<^{rA-~(*FUbyc(
zd5l=$>V)UOqiudvcnLg8wNH}`8?e6`lLc%9_EQT4de4^!$6A4zz(}=AU=gs7`c_~i
zu(yhz4y*_EQZog%0ljLaz|{HjjCe+19f$NpV6XMbuWnU8YUfcM&AC&*r>5&pLUD*rTI?0NGzB3Zf~6J!cz
z!e3CkPNR`Uz-?N^x@T0vpHlIsqpAm1YnAMC+TfeiN>QaQ;FCtJk{wPS{9g5`sLFxm
zTE%*6?1bO0rksJQ1?bl*=}=#ONAM=KQB*m=Vy$8gH%j13RlBHafQz+CTHOf0L@k(s
z%Da%=w5aUQ$uptnsa>Ke0?x5D(ixS|vsC<4`k(
zWcBHp98cB&Cn__?9NaD4*ajJ=#-C*-dK!RZ2dGsgY+8+r<((wnqupu1;c7c&x$}U7
zm3cN>tE82jdHnDs;fbDVVB!EZYo%;Hz=OnNXtxQ*aY&?1`0nT>GL5Au>)R^1&*a4*6QfPy0iR+7
z%MjOqpHyS!A#MO}iBYRW+y;I^twiijxk6qsFs{k!Xc+@B2fj)8aCZstL0gq{4&A>6
zzQN+w!>9pzudS}S4Nv1rKJ!2c+p@B><{U)
z5iGL=V?8UtOKhdyJs`MB^_tJZWi4sYD}}P8E#>`#t&t@%6S`O}5lsw-S1}
zdP_9*z{_lnEL?5SLN(?BG^xvEt)%JJ*(w)SU^ALr0?QYc?5+mp*$O(k9(JKHpY`no
z3T#C(ze|F%Z@DbBoi>!3jU=~ALYyKb^SUGyU4djymxO`aiDXungz}q_orgHN&}Py3Q5zR`#v
z-7>H;hhHn7U`gSD@18-O22T@pvU@&ojHT*1NygbILy&5_O6nTuXsIMlEx(&Jq5(e2
z=KJ$mBii62ZC)q(lw#R)TJ}i`(dWSV*h~B)*P}0n53~7l(O1Ix7)UjghFqoxK3zZ2Y7#MUc;W>}=naPh8?4eFFj?P}9u
zy=_LyJXkMbEX+VJ%Tk3b%;27COrh=40PA5hT-spWZAOJDH_C?9vLsawtedc8PchKL
zGwMFVRf65N5qHoYSh&r&ds<*F6@QtXD(WVAePJnPH6>#!^e0Wp=qq7oO&p1t4Q`EAM-(xOz(0T&+3x`=pxnLyu}I
zgSL{*jYT=dgO5b3gZIlR9{jB~qjhD_7ENhg4fN}1wQez$G=L9Bt5px+)dv2O2qI5e
z#rub7wNK=k^xQUd-cTICC_0l!Bau)7ibmfCfNEdHe(!PnKd0yBZHsQ4>^
zMZmwRnF1?;RRJ>w(XzX9TtZpH!a;O!do$2)(WX<^eaVDFVxZN-Yt%6L^o>D6j>1
zmwHQ}Z?(L+Zx@&YELCHQfF-~i)dGPvz-!f7fsMc=YL`IoE%HeER$wOZVimsxSOmOC
z%@kM(JXft0SPz`1o>{`n^zd8dz4Yp(9J~80<$#<4pKWpbGx;L;Oj|$gVb0c7@Uw-F
zb2k9bRCAZfVP9M$%H)N#aN9K#ewwZCQ-$kt_^HCjxvPQG)!^lF*k6Y00r)hF+pgZ*
zrl%2p2M>s7wN-TK;8Y9SeuePS+#Zjjs{oJEI`P{B
z9~rHdimnwr!os#+${HRD%uLbcf`@6H_?5wjGE=M3)q+#BPTJQDPiD4?F6vJ_2(*qH
z!pMdXrf$&{gOgYa9s^f{6KR{^25T_d;<97mfT2UmaxMXRZT
z_kahIPjD+ZHd<{GoN^cIJ#7)33yxuZ5L^b1rX5d!Yr#?6ae|w{{b_^XsB#weXtf*c
z&Ib2Y+i%bxHx)yBsn2dO2dk-D*eq1RBh=~}n85piVPZMjW2|LGj8ePr;jEAf{v}HF
zUcs%L2mV2A_M1b!6Vha>QjXxeD3!mNy^FPV*@56|%X+xHTRseJwj9)v6{g1%yNK(lv)gaKNMa1z8b;XQEJvxC>y|Ugp@2?M)=*Fd!p1*G59)B+8^InLzmIYXHYeOudr3p
zJ>mCu1TUqHqDluZuvOAGh0yb9;Z{@?;JLO+dS(ywY}zNPR`6-IO8O-wz;Z$BwsO3l
z1I|+8SFu}L0nAi${T#2QgDRmXtCg!b>g)xcWcjQ8t2pW$ho`H}rF`m7s+0V7rtfvNiyuVr}@&n*j3)|n*4p%%y`>U;g=KP!nZqYiK8hP-;{ngZ4@LL0J);j6f
zT6j}`mH!-mP2fgbXPwWy_sh3q2A6RLjl7jU%z%GjacdV{2z}31r`59SUJu`It0n7B
z=v%hBPP`Ap-?Y_|IO+j;q*}q7#5)VVNBD4e5%BNo*xl@L)1GG&S^@u?&DWQ)3*8BS
z#pZs|H^b{}-XwbOgM10W=KDpT4&QC_wA;|_y<@SQgIi@pZ_SGD~%GoJ4g
z?T7qDNTRzH_>wK3V8lMu5!_)DzD*5z(I)9e0ptZW{dPRcfY00V$;K8)txZla>LKhB
zZfA$r3Vgc1+VmnDMqj180caO-Ht^B@>WGMozz?gLcOb3+KEz<|K)e&I)J73E0|Shm
zh`k%-4Zyc|Fv&83_o(-Mz&zkx
zYKp*e;GfjRYuU={zu{ktJWYNR4rt$N+bj-wFhS6AOj
z^a|iAfg`zn_CQK(GQntt++>l6iAG8lF9iCleShV7kqf?|zv>WN2ELw~W+%87d>xZo
za5K1=ju9NSnaxsvwOepD_!_!ba54Dm{wiS?3r{t8nUtICt_Lm=Xnknk3cJc?w95C0
zEO?@mWgr{2h+w;9tiYG4XV%GB0WW2&cFS0SFIMd$ZU!!-BoTWbmDO?P-H6kH7xh;M
zMVtq|Ky5_qE(4yY`rc~}QKg&N<7|VUE3-8@?7*X{_r2!Oz|n*J#q02DSxk{9T6bqE
zbe8QF$=@IH81oU#wDc;ejvLMoKg%{)A1ekm2+p(v$o<;@J>Aj=3iwMvhVhuJ$kXqk
zq!BFgNLiPDj4Tl;4o&Vh#SE}Rs4O3!=Gex#|kduRB*CdBH~=&
z5azguOTdHGE)iD)llrS!uOhAoC#qf>5VryoSW86gdrBT!3q+g^^f5n0Tm+6&&xp7J
zIEeZA8seScfvR1^&A?dZr-;2z%j1qOq%c2$G0aa9=YgZuMiG|*qqy;2M_dE$uf7#=
z1F&CzwMxWo;J#|g{fJX)IH7aUlQZUJ*N
ze*kgRGn~u0k42mb?ygpfxDXgYf4_ma9NbO4CE{A3hyE6EBiOCtA4D9!mD4!=E#g$L
zODz#`E-;M#7I6vKRJ%l64K(QQH#t1kgL%^TeuzV5E4ZVd+9Yz{vmC(tseF-VgWLM4
zBO)&X|JF}!6?p~t*M4g1KalSP|I$x&h`bs6b3e6C0mB`b|s6@U338s0p*`)t&*ObTUZ-{n#v7Rk=5+}s7)1!?SOT@|ab9~?!e`n1
zVd3@g9Gkx_ybXSq&4a>IUzE3~l0l7o0-gsy-Qw0?VoNv55oC7>_`8b;PU#X%Hd+v5
zbqP)|d^==CKK2;HyYMj{B02CW7PnXE5_qO?nh!kLQoVTXh5{2=oh!nbA4|5=p26-zqc^l`rUcXCckoRm^yH-%UfE{OkK}E(z53a4yXK5IQgw!p`ZN#CtrvDe)+|5
z{M^9RQp`W4au27x3k~B%3>R4%`+s+0=j8FU{xCKJQNO_D-EA02Ek>zPco9{h%+!I!
z-GiZ%KTXxMxlTcMkgJa04!h+H(^xT&8G<|!`_Ruyu#=y}4&r56e-#EMIg-vajjRVv
zBbH!0!b|x@G7jfvu#>Ol7fp5$paSQ?IJd4OX(P!;7{(Za-bmr0pER(OKgzF8^KdD^{(E@J6ZS^IfUtA+>S7
zYk>K*dVap^4D%7yE^N1&-rYOE{OH9o7r2s5&XTHkPM??;{rRpvrdqzhm8p_^UY}}P
ztx?)j12O-^yMt9!D3sMyTbB=t82