From b3e10bc079de759b6ea1cc1a8ba525dcc59e7283 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Wed, 29 Oct 2025 18:44:57 +0100 Subject: [PATCH 01/11] chore: update v0.5.0 benchmarks (#481) --- docs/docs/04-benchmarks/inference-time.md | 96 ++++++++---------- docs/docs/04-benchmarks/memory-usage.md | 56 ++++++----- docs/docs/04-benchmarks/model-size.md | 6 +- .../04-benchmarks/inference-time.md | 98 +++++++++---------- .../04-benchmarks/memory-usage.md | 59 ++++++----- .../version-0.5.x/04-benchmarks/model-size.md | 12 ++- 6 files changed, 168 insertions(+), 159 deletions(-) diff --git a/docs/docs/04-benchmarks/inference-time.md b/docs/docs/04-benchmarks/inference-time.md index dd0f1275a..89f1f9de1 100644 --- a/docs/docs/04-benchmarks/inference-time.md +++ b/docs/docs/04-benchmarks/inference-time.md @@ -8,46 +8,48 @@ Times presented in the tables are measured as consecutive runs of the model. Ini ## Classification -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 105 | 110 | 149 | 299 | 227 | ## Object Detection -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 116 | 120 | 164 | 257 | 129 | ## Style Transfer -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1356 | 1550 | 2003 | 2578 | 2328 | +| STYLE_TRANSFER_MOSAIC | 1376 | 1456 | 1971 | 2657 | 2394 | +| STYLE_TRANSFER_UDNIE | 1389 | 1499 | 1858 | 2380 | 2124 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1339 | 1514 | 2004 | 2608 | 2371 | ## OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | 
:-------------------------------: | :-------------------------------: | -| Detector (CRAFT_800) | 2099 | 2227 | ❌ | 2245 | 7108 | -| Recognizer (CRNN_512) | 70 | 252 | ❌ | 54 | 151 | -| Recognizer (CRNN_256) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_128) | 17 | 83 | ❌ | 14 | 39 | +Notice that the recognizer models were executed between 3 and 7 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_800_QUANTIZED) | 669 | 649 | 825 | 541 | 474 | +| Recognizer (CRNN_512) | 48 | 47 | 60 | 91 | 72 | +| Recognizer (CRNN_256) | 22 | 22 | 29 | 51 | 30 | +| Recognizer (CRNN_128) | 11 | 11 | 14 | 28 | 17 | ## Vertical OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_1280) | 5457 | 5833 | ❌ | 6296 | 14053 | -| Detector (CRAFT_320) | 1351 | 1460 | ❌ | 1485 | 3101 | -| Recognizer (CRNN_512) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_64) | 10 | 33 | ❌ | 7 | 18 | +Notice that the recognizer models, as well as detector CRAFT_320 model, were executed between 4 and 21 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_1280_QUANTIZED) | 1749 | 1804 | 2105 | 1216 | 1171 | +| Detector (CRAFT_320_QUANTIZED) | 458 | 474 | 561 | 360 | 332 | +| Recognizer (CRNN_512) | 54 | 52 | 68 | 144 | 72 | +| Recognizer (CRNN_64) | 5 | 6 | 7 | 28 | 11 | ## LLMs @@ -62,41 +64,31 @@ Times presented in the tables are measured as consecutive runs of the model. Ini ❌ - Insufficient RAM. -### Streaming mode - -Notice than for `Whisper` model which has to take as an input 30 seconds audio chunks (for shorter audio it is automatically padded with silence to 30 seconds) `fast` mode has the lowest latency (time from starting transcription to first token returned, caused by streaming algorithm), but the slowest speed. If you believe that this might be a problem for you, prefer `balanced` mode instead. 
- -| Model (mode) | iPhone 16 Pro (XNNPACK) [latency \| tokens/s] | iPhone 14 Pro (XNNPACK) [latency \| tokens/s] | iPhone SE 3 (XNNPACK) [latency \| tokens/s] | Samsung Galaxy S24 (XNNPACK) [latency \| tokens/s] | OnePlus 12 (XNNPACK) [latency \| tokens/s] | -| ----------------------- | :-------------------------------------------: | :-------------------------------------------: | :-----------------------------------------: | :------------------------------------------------: | :----------------------------------------: | -| Whisper-tiny (fast) | 2.8s \| 5.5t/s | 3.7s \| 4.4t/s | 4.4s \| 3.4t/s | 5.5s \| 3.1t/s | 5.3s \| 3.8t/s | -| Whisper-tiny (balanced) | 5.6s \| 7.9t/s | 7.0s \| 6.3t/s | 8.3s \| 5.0t/s | 8.4s \| 6.7t/s | 7.7s \| 7.2t/s | -| Whisper-tiny (quality) | 10.3s \| 8.3t/s | 12.6s \| 6.8t/s | 7.8s \| 8.9t/s | 13.5s \| 7.1t/s | 12.9s \| 7.5t/s | - ### Encoding Average time for encoding audio of given length over 10 runs. For `Whisper` model we only list 30 sec audio chunks since `Whisper` does not accept other lengths (for shorter audio the audio needs to be padded to 30sec with silence). -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 1034 | 1344 | 1269 | 2916 | 2143 | +| Whisper-tiny (30s) | 1391 | 1372 | 1894 | 1303 | 1214 | ### Decoding -Average time for decoding one token in sequence of 100 tokens, with encoding context is obtained from audio of noted length. +Average time for decoding one token in sequence of approximately 100 tokens, with encoding context is obtained from audio of noted length. 
-| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 128.03 | 113.65 | 141.63 | 89.08 | 84.49 | +| Whisper-tiny (30s) | 53 | 53 | 74 | 100 | 84 | ## Text Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 15 | 22 | 23 | 36 | 31 | -| ALL_MPNET_BASE_V2 | 71 | 96 | 101 | 112 | 105 | -| MULTI_QA_MINILM_L6_COS_V1 | 15 | 22 | 23 | 36 | 31 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 71 | 95 | 100 | 112 | 105 | -| CLIP_VIT_BASE_PATCH32_TEXT | 31 | 47 | 48 | 55 | 49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 16 | 16 | 19 | 54 | 28 | +| ALL_MPNET_BASE_V2 | 115 | 116 | 144 | 145 | 95 | +| MULTI_QA_MINILM_L6_COS_V1 | 16 | 16 | 20 | 47 | 28 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 112 | 119 | 144 | 146 | 96 | +| CLIP_VIT_BASE_PATCH32_TEXT | 47 | 45 | 57 | 65 | 48 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. @@ -104,9 +96,9 @@ Benchmark times for text embeddings are highly dependent on the sentence length. ## Image Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 48 | 64 | 69 | 65 | 63 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 70 | 70 | 90 | 66 | 58 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. 
@@ -114,8 +106,6 @@ Image embedding benchmark times are measured using 224×224 pixel images, as req ## Text to Image -Average time for generating one image of size 256×256 in 10 inference steps. - -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :-------------------: | :-------------------------------: | :-----------------------: | -| BK_SDM_TINY_VPRED_256 | 19100 | 25000 | ❌ | ❌ | 23100 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| BK_SDM_TINY_VPRED_256 | 21184 | 21021 | ❌ | 18834 | 16617 | diff --git a/docs/docs/04-benchmarks/memory-usage.md b/docs/docs/04-benchmarks/memory-usage.md index e34c8a7ca..a0c5a7b6d 100644 --- a/docs/docs/04-benchmarks/memory-usage.md +++ b/docs/docs/04-benchmarks/memory-usage.md @@ -2,76 +2,80 @@ title: Memory Usage --- +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 (Android). +::: + ## Classification | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 130 | 85 | +| EFFICIENTNET_V2_S | 230 | 87 | ## Object Detection | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 90 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | ## Style Transfer | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 950 | 350 | -| STYLE_TRANSFER_MOSAIC | 950 | 350 | -| STYLE_TRANSFER_UDNIE | 950 | 350 | -| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 | +| STYLE_TRANSFER_CANDY | 1200 | 380 | +| STYLE_TRANSFER_MOSAIC | 1200 | 380 | +| STYLE_TRANSFER_UDNIE | 1200 | 380 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | ## OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 2100 | 1782 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ## Vertical OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2770 | 3720 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1770 | 2740 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ---------------------------------------------------------------------------------------- | :--------------------: | :----------------: | +| Detector 
(CRAFT_1280_QUANTIZED) + Detector (CRAFT_320_QUANTIZED) + Recognizer (CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280_QUANTIZED) + Detector(CRAFT_320_QUANTIZED) + Recognizer (CRNN_64) | 1070 | 1000 | ## LLMs | Model | Android (XNNPACK) [GB] | iOS (XNNPACK) [GB] | | --------------------- | :--------------------: | :----------------: | -| LLAMA3_2_1B | 3.2 | 3.1 | -| LLAMA3_2_1B_SPINQUANT | 1.9 | 2 | -| LLAMA3_2_1B_QLORA | 2.2 | 2.5 | +| LLAMA3_2_1B | 3.3 | 3.1 | +| LLAMA3_2_1B_SPINQUANT | 1.9 | 2.4 | +| LLAMA3_2_1B_QLORA | 2.7 | 2.8 | | LLAMA3_2_3B | 7.1 | 7.3 | | LLAMA3_2_3B_SPINQUANT | 3.7 | 3.8 | -| LLAMA3_2_3B_QLORA | 4 | 4.1 | +| LLAMA3_2_3B_QLORA | 3.9 | 4.0 | ## Speech to text | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | -| WHISPER_TINY | 900 | 600 | +| WHISPER_TINY | 410 | 375 | ## Text Embeddings | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | -| ALL_MINILM_L6_V2 | 85 | 100 | -| ALL_MPNET_BASE_V2 | 390 | 465 | -| MULTI_QA_MINILM_L6_COS_V1 | 115 | 130 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 415 | 490 | -| CLIP_VIT_BASE_PATCH32_TEXT | 195 | 250 | +| ALL_MINILM_L6_V2 | 95 | 110 | +| ALL_MPNET_BASE_V2 | 405 | 455 | +| MULTI_QA_MINILM_L6_COS_V1 | 120 | 140 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 435 | 455 | +| CLIP_VIT_BASE_PATCH32_TEXT | 200 | 280 | ## Image Embeddings | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | --------------------------- | :--------------------: | :----------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 350 | 340 | +| CLIP_VIT_BASE_PATCH32_IMAGE | 345 | 340 | ## Text to Image | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | --------------------- | ---------------------- | ------------------ | -| BK_SDM_TINY_VPRED_256 | 2900 | 2800 | -| BK_SDM_TINY_VPRED | 6700 | 6560 | +| BK_SDM_TINY_VPRED_256 | 2400 | 2400 | +| BK_SDM_TINY_VPRED | 6210 | 6050 | diff --git a/docs/docs/04-benchmarks/model-size.md b/docs/docs/04-benchmarks/model-size.md index 5cf87f6fa..30999fa7b 100644 --- a/docs/docs/04-benchmarks/model-size.md +++ b/docs/docs/04-benchmarks/model-size.md @@ -27,7 +27,7 @@ title: Model Size | Model | XNNPACK [MB] | | --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | +| Detector (CRAFT_800) | 19.8 | | Recognizer (CRNN_512) | 15 - 18\* | | Recognizer (CRNN_256) | 16 - 18\* | | Recognizer (CRNN_128) | 17 - 19\* | @@ -38,8 +38,8 @@ title: Model Size | Model | XNNPACK [MB] | | ------------------------ | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | +| Detector (CRAFT_1280) | 19.8 | +| Detector (CRAFT_320) | 19.8 | | Recognizer (CRNN_EN_512) | 15 - 18\* | | Recognizer (CRNN_EN_64) | 15 - 16\* | diff --git a/docs/versioned_docs/version-0.5.x/04-benchmarks/inference-time.md b/docs/versioned_docs/version-0.5.x/04-benchmarks/inference-time.md index 504c0f6e9..89f1f9de1 100644 --- a/docs/versioned_docs/version-0.5.x/04-benchmarks/inference-time.md +++ b/docs/versioned_docs/version-0.5.x/04-benchmarks/inference-time.md @@ -8,46 +8,48 @@ Times presented in the tables are measured as consecutive runs of the model. 
Ini ## Classification -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 105 | 110 | 149 | 299 | 227 | ## Object Detection -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 116 | 120 | 164 | 257 | 129 | ## Style Transfer -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1356 | 1550 | 2003 | 2578 | 2328 | +| STYLE_TRANSFER_MOSAIC | 1376 | 1456 | 1971 | 2657 | 2394 | +| STYLE_TRANSFER_UDNIE | 1389 | 1499 | 1858 | 2380 | 2124 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1339 | 1514 | 2004 | 2608 | 2371 | ## OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_800) | 2099 | 2227 | ❌ | 2245 | 7108 | -| Recognizer (CRNN_512) | 70 | 252 | ❌ | 54 | 151 | -| Recognizer (CRNN_256) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_128) | 17 | 83 | ❌ | 14 | 39 | +Notice that the recognizer models were executed between 3 and 7 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. 
+| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_800_QUANTIZED) | 669 | 649 | 825 | 541 | 474 | +| Recognizer (CRNN_512) | 48 | 47 | 60 | 91 | 72 | +| Recognizer (CRNN_256) | 22 | 22 | 29 | 51 | 30 | +| Recognizer (CRNN_128) | 11 | 11 | 14 | 28 | 17 | ## Vertical OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_1280) | 5457 | 5833 | ❌ | 6296 | 14053 | -| Detector (CRAFT_320) | 1351 | 1460 | ❌ | 1485 | 3101 | -| Recognizer (CRNN_512) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_64) | 10 | 33 | ❌ | 7 | 18 | +Notice that the recognizer models, as well as detector CRAFT_320 model, were executed between 4 and 21 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_1280_QUANTIZED) | 1749 | 1804 | 2105 | 1216 | 1171 | +| Detector (CRAFT_320_QUANTIZED) | 458 | 474 | 561 | 360 | 332 | +| Recognizer (CRNN_512) | 54 | 52 | 68 | 144 | 72 | +| Recognizer (CRNN_64) | 5 | 6 | 7 | 28 | 11 | ## LLMs @@ -62,41 +64,31 @@ Times presented in the tables are measured as consecutive runs of the model. Ini ❌ - Insufficient RAM. -### Streaming mode - -Notice than for `Whisper` model which has to take as an input 30 seconds audio chunks (for shorter audio it is automatically padded with silence to 30 seconds) `fast` mode has the lowest latency (time from starting transcription to first token returned, caused by streaming algorithm), but the slowest speed. If you believe that this might be a problem for you, prefer `balanced` mode instead. - -| Model (mode) | iPhone 16 Pro (XNNPACK) [latency \| tokens/s] | iPhone 14 Pro (XNNPACK) [latency \| tokens/s] | iPhone SE 3 (XNNPACK) [latency \| tokens/s] | Samsung Galaxy S24 (XNNPACK) [latency \| tokens/s] | OnePlus 12 (XNNPACK) [latency \| tokens/s] | -| ------------------------- | :-------------------------------------------: | :-------------------------------------------: | :-----------------------------------------: | :------------------------------------------------: | :----------------------------------------: | -| Whisper-tiny (fast) | 2.8s \| 5.5t/s | 3.7s \| 4.4t/s | 4.4s \| 3.4t/s | 5.5s \| 3.1t/s | 5.3s \| 3.8t/s | -| Whisper-tiny (balanced) | 5.6s \| 7.9t/s | 7.0s \| 6.3t/s | 8.3s \| 5.0t/s | 8.4s \| 6.7t/s | 7.7s \| 7.2t/s | -| Whisper-tiny (quality) | 10.3s \| 8.3t/s | 12.6s \| 6.8t/s | 7.8s \| 8.9t/s | 13.5s \| 7.1t/s | 12.9s \| 7.5t/s | - ### Encoding Average time for encoding audio of given length over 10 runs. 
For `Whisper` model we only list 30 sec audio chunks since `Whisper` does not accept other lengths (for shorter audio the audio needs to be padded to 30sec with silence). -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| -------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 1034 | 1344 | 1269 | 2916 | 2143 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Whisper-tiny (30s) | 1391 | 1372 | 1894 | 1303 | 1214 | ### Decoding -Average time for decoding one token in sequence of 100 tokens, with encoding context is obtained from audio of noted length. +Average time for decoding one token in sequence of approximately 100 tokens, with encoding context is obtained from audio of noted length. -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| -------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 128.03 | 113.65 | 141.63 | 89.08 | 84.49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Whisper-tiny (30s) | 53 | 53 | 74 | 100 | 84 | ## Text Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 15 | 22 | 23 | 36 | 31 | -| ALL_MPNET_BASE_V2 | 71 | 96 | 101 | 112 | 105 | -| MULTI_QA_MINILM_L6_COS_V1 | 15 | 22 | 23 | 36 | 31 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 71 | 95 | 100 | 112 | 105 | -| CLIP_VIT_BASE_PATCH32_TEXT | 31 | 47 | 48 | 55 | 49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 16 | 16 | 19 | 54 | 28 | +| ALL_MPNET_BASE_V2 | 115 | 116 | 144 | 145 | 95 | +| MULTI_QA_MINILM_L6_COS_V1 | 16 | 16 | 20 | 47 | 28 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 112 | 119 | 144 | 146 | 96 | +| CLIP_VIT_BASE_PATCH32_TEXT | 47 | 45 | 57 | 65 | 48 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. 
For shorter or longer sentences, inference time may vary accordingly. @@ -104,10 +96,16 @@ Benchmark times for text embeddings are highly dependent on the sentence length. ## Image Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 48 | 64 | 69 | 65 | 63 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 70 | 70 | 90 | 66 | 58 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. ::: + +## Text to Image + +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| BK_SDM_TINY_VPRED_256 | 21184 | 21021 | ❌ | 18834 | 16617 | diff --git a/docs/versioned_docs/version-0.5.x/04-benchmarks/memory-usage.md b/docs/versioned_docs/version-0.5.x/04-benchmarks/memory-usage.md index 684020e2a..a0c5a7b6d 100644 --- a/docs/versioned_docs/version-0.5.x/04-benchmarks/memory-usage.md +++ b/docs/versioned_docs/version-0.5.x/04-benchmarks/memory-usage.md @@ -2,69 +2,80 @@ title: Memory Usage --- +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 (Android). 
+::: + ## Classification | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 130 | 85 | +| EFFICIENTNET_V2_S | 230 | 87 | ## Object Detection | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 90 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | ## Style Transfer | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 950 | 350 | -| STYLE_TRANSFER_MOSAIC | 950 | 350 | -| STYLE_TRANSFER_UDNIE | 950 | 350 | -| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 | +| STYLE_TRANSFER_CANDY | 1200 | 380 | +| STYLE_TRANSFER_MOSAIC | 1200 | 380 | +| STYLE_TRANSFER_UDNIE | 1200 | 380 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | ## OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 2100 | 1782 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ## Vertical OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2770 | 3720 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1770 | 2740 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ---------------------------------------------------------------------------------------- | :--------------------: | :----------------: | +| Detector (CRAFT_1280_QUANTIZED) + Detector (CRAFT_320_QUANTIZED) + Recognizer (CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280_QUANTIZED) + Detector(CRAFT_320_QUANTIZED) + Recognizer (CRNN_64) | 1070 | 1000 | ## LLMs | Model | Android (XNNPACK) [GB] | iOS (XNNPACK) [GB] | | --------------------- | :--------------------: | :----------------: | -| LLAMA3_2_1B | 3.2 | 3.1 | -| LLAMA3_2_1B_SPINQUANT | 1.9 | 2 | -| LLAMA3_2_1B_QLORA | 2.2 | 2.5 | +| LLAMA3_2_1B | 3.3 | 3.1 | +| LLAMA3_2_1B_SPINQUANT | 1.9 | 2.4 | +| LLAMA3_2_1B_QLORA | 2.7 | 2.8 | | LLAMA3_2_3B | 7.1 | 7.3 | | LLAMA3_2_3B_SPINQUANT | 3.7 | 3.8 | -| LLAMA3_2_3B_QLORA | 4 | 4.1 | +| LLAMA3_2_3B_QLORA | 3.9 | 4.0 | ## Speech to text | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | -| WHISPER_TINY | 900 | 600 | +| WHISPER_TINY | 410 | 375 | ## Text Embeddings | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | -| ALL_MINILM_L6_V2 | 85 | 100 | -| ALL_MPNET_BASE_V2 | 390 | 465 | -| MULTI_QA_MINILM_L6_COS_V1 | 115 | 130 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 415 | 490 | -| CLIP_VIT_BASE_PATCH32_TEXT | 195 | 250 | +| ALL_MINILM_L6_V2 | 95 | 110 | +| ALL_MPNET_BASE_V2 | 405 | 455 | +| MULTI_QA_MINILM_L6_COS_V1 | 120 | 140 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 435 | 455 | 
+| CLIP_VIT_BASE_PATCH32_TEXT | 200 | 280 | ## Image Embeddings | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | --------------------------- | :--------------------: | :----------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 350 | 340 | +| CLIP_VIT_BASE_PATCH32_IMAGE | 345 | 340 | + +## Text to Image + +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| --------------------- | ---------------------- | ------------------ | +| BK_SDM_TINY_VPRED_256 | 2400 | 2400 | +| BK_SDM_TINY_VPRED | 6210 | 6050 | diff --git a/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md b/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md index 9d20c95d5..30999fa7b 100644 --- a/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md +++ b/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md @@ -27,7 +27,7 @@ title: Model Size | Model | XNNPACK [MB] | | --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | +| Detector (CRAFT_800) | 19.8 | | Recognizer (CRNN_512) | 15 - 18\* | | Recognizer (CRNN_256) | 16 - 18\* | | Recognizer (CRNN_128) | 17 - 19\* | @@ -38,8 +38,8 @@ title: Model Size | Model | XNNPACK [MB] | | ------------------------ | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | +| Detector (CRAFT_1280) | 19.8 | +| Detector (CRAFT_320) | 19.8 | | Recognizer (CRNN_EN_512) | 15 - 18\* | | Recognizer (CRNN_EN_64) | 15 - 16\* | @@ -82,3 +82,9 @@ title: Model Size | Model | XNNPACK [MB] | | --------------------------- | :----------: | | CLIP_VIT_BASE_PATCH32_IMAGE | 352 | + +## Text to Image + +| Model | Text encoder (XNNPACK) [MB] | UNet (XNNPACK) [MB] | VAE decoder (XNNPACK) [MB] | +| ----------------- | --------------------------- | ------------------- | -------------------------- | +| BK_SDM_TINY_VPRED | 492 | 1290 | 198 | From 8fbfd006e0f1939def764baa867f64a65f2ae52d Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Mon, 3 Nov 2025 08:14:02 +0100 Subject: [PATCH 02/11] chore: update v0.4.0 benchmarks (#481) --- .../benchmarks/inference-time.md | 68 ++++++++++--------- .../version-0.4.x/benchmarks/memory-usage.md | 14 ++-- .../version-0.4.x/benchmarks/model-size.md | 24 +++---- 3 files changed, 54 insertions(+), 52 deletions(-) diff --git a/docs/versioned_docs/version-0.4.x/benchmarks/inference-time.md b/docs/versioned_docs/version-0.4.x/benchmarks/inference-time.md index da35e7b6e..f5d6d0113 100644 --- a/docs/versioned_docs/version-0.4.x/benchmarks/inference-time.md +++ b/docs/versioned_docs/version-0.4.x/benchmarks/inference-time.md @@ -8,50 +8,52 @@ Times presented in the tables are measured as consecutive runs of the model. 
Ini ## Classification -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 150 | 161 | 227 | 196 | 214 | ## Object Detection -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 261 | 279 | 414 | 125 | 115 | ## Style Transfer -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_MOSAIC | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_UDNIE | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1565 | 1675 | 2325 | 1750 | 1620 | ## OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_800) | 2099 | 2227 | ❌ | 2245 | 7108 | -| Recognizer (CRNN_512) | 70 | 252 | ❌ | 54 | 151 | -| Recognizer (CRNN_256) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_128) | 17 | 83 | ❌ | 14 | 39 | +Notice that the recognizer models were executed between 3 and 7 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. 
+| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_800_QUANTIZED) | 779 | 897 | 1276 | 553 | 586 | +| Recognizer (CRNN_512) | 77 | 74 | 244 | 56 | 57 | +| Recognizer (CRNN_256) | 35 | 37 | 120 | 28 | 30 | +| Recognizer (CRNN_128) | 18 | 19 | 60 | 14 | 16 | ## Vertical OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_1280) | 5457 | 5833 | ❌ | 6296 | 14053 | -| Detector (CRAFT_320) | 1351 | 1460 | ❌ | 1485 | 3101 | -| Recognizer (CRNN_512) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_64) | 10 | 33 | ❌ | 7 | 18 | +Notice that the recognizer models, as well as detector CRAFT_320 model, were executed between 4 and 21 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_1280_QUANTIZED) | 1918 | 2304 | 3371 | 1391 | 1445 | +| Detector (CRAFT_320_QUANTIZED) | 473 | 563 | 813 | 361 | 382 | +| Recognizer (CRNN_512) | 78 | 83 | 310 | 59 | 57 | +| Recognizer (CRNN_64) | 9 | 9 | 38 | 8 | 7 | ## LLMs -| Model | iPhone 16 Pro (XNNPACK) [tokens/s] | iPhone 13 Pro (XNNPACK) [tokens/s] | iPhone SE 3 (XNNPACK) [tokens/s] | Samsung Galaxy S24 (XNNPACK) [tokens/s] | OnePlus 12 (XNNPACK) [tokens/s] | +| Model | iPhone 17 Pro (XNNPACK) [tokens/s] | iPhone 16 Pro (XNNPACK) [tokens/s] | iPhone SE 3 (XNNPACK) [tokens/s] | Samsung Galaxy S24 (XNNPACK) [tokens/s] | OnePlus 12 (XNNPACK) [tokens/s] | | --------------------- | :--------------------------------: | :--------------------------------: | :------------------------------: | :-------------------------------------: | :-----------------------------: | | LLAMA3_2_1B | 16.1 | 11.4 | ❌ | 15.6 | 19.3 | | LLAMA3_2_1B_SPINQUANT | 40.6 | 16.7 | 16.5 | 40.3 | 48.2 | @@ -68,7 +70,7 @@ Times presented in the tables are measured as consecutive runs of the model. Ini Notice than for `Whisper` model which has to take as an input 30 seconds audio chunks (for shorter audio it is automatically padded with silence to 30 seconds) `fast` mode has the lowest latency (time from starting transcription to first token returned, caused by streaming algorithm), but the slowest speed. That's why for the lowest latency and the fastest transcription we suggest using `Moonshine` model, if you still want to proceed with `Whisper` use preferably the `balanced` mode. 
-| Model (mode) | iPhone 16 Pro (XNNPACK) [latency \| tokens/s] | iPhone 14 Pro (XNNPACK) [latency \| tokens/s] | iPhone SE 3 (XNNPACK) [latency \| tokens/s] | Samsung Galaxy S24 (XNNPACK) [latency \| tokens/s] | OnePlus 12 (XNNPACK) [latency \| tokens/s] | +| Model (mode) | iPhone 17 Pro (XNNPACK) [latency \| tokens/s] | iPhone 16 Pro (XNNPACK) [latency \| tokens/s] | iPhone SE 3 (XNNPACK) [latency \| tokens/s] | Samsung Galaxy S24 (XNNPACK) [latency \| tokens/s] | OnePlus 12 (XNNPACK) [latency \| tokens/s] | | ------------------------- | :-------------------------------------------: | :-------------------------------------------: | :-----------------------------------------: | :------------------------------------------------: | :----------------------------------------: | | Moonshine-tiny (fast) | 0.8s \| 19.0t/s | 1.5s \| 11.3t/s | 1.5s \| 10.4t/s | 2.0s \| 8.8t/s | 1.6s \| 12.5t/s | | Moonshine-tiny (balanced) | 2.0s \| 20.0t/s | 3.2s \| 12.4t/s | 3.7s \| 10.4t/s | 4.6s \| 11.2t/s | 3.4s \| 14.6t/s | @@ -81,7 +83,7 @@ Notice than for `Whisper` model which has to take as an input 30 seconds audio c Average time for encoding audio of given length over 10 runs. For `Whisper` model we only list 30 sec audio chunks since `Whisper` does not accept other lengths (for shorter audio the audio needs to be padded to 30sec with silence). -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | -------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | | Moonshine-tiny (5s) | 99 | 95 | 115 | 284 | 277 | | Moonshine-tiny (10s) | 178 | 177 | 204 | 555 | 528 | @@ -92,7 +94,7 @@ Average time for encoding audio of given length over 10 runs. For `Whisper` mode Average time for decoding one token in sequence of 100 tokens, with encoding context is obtained from audio of noted length. 
-| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | -------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | | Moonshine-tiny (5s) | 48.98 | 47.98 | 46.86 | 36.70 | 29.03 | | Moonshine-tiny (10s) | 54.24 | 51.74 | 55.07 | 46.31 | 32.41 | @@ -101,9 +103,9 @@ Average time for decoding one token in sequence of 100 tokens, with encoding con ## Text Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 53 | 69 | 78 | 60 | 65 | -| ALL_MPNET_BASE_V2 | 352 | 423 | 478 | 521 | 527 | -| MULTI_QA_MINILM_L6_COS_V1 | 135 | 166 | 180 | 158 | 165 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 503 | 598 | 680 | 694 | 743 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 50 | 58 | 84 | 58 | 58 | +| ALL_MPNET_BASE_V2 | 352 | 428 | 879 | 483 | 517 | +| MULTI_QA_MINILM_L6_COS_V1 | 133 | 161 | 269 | 151 | 155 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 502 | 796 | 1216 | 915 | 713 | diff --git a/docs/versioned_docs/version-0.4.x/benchmarks/memory-usage.md b/docs/versioned_docs/version-0.4.x/benchmarks/memory-usage.md index 862ffd574..25298f630 100644 --- a/docs/versioned_docs/version-0.4.x/benchmarks/memory-usage.md +++ b/docs/versioned_docs/version-0.4.x/benchmarks/memory-usage.md @@ -25,16 +25,16 @@ title: Memory Usage ## OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 2100 | 1782 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ## Vertical OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2770 | 3720 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1770 | 2740 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ---------------------------------------------------------------------------------------- | :--------------------: | :----------------: | +| Detector (CRAFT_1280_QUANTIZED) + Detector (CRAFT_320_QUANTIZED) + Recognizer 
(CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1070 | 1000 | ## LLMs diff --git a/docs/versioned_docs/version-0.4.x/benchmarks/model-size.md b/docs/versioned_docs/version-0.4.x/benchmarks/model-size.md index f39fa2f14..d5e890120 100644 --- a/docs/versioned_docs/version-0.4.x/benchmarks/model-size.md +++ b/docs/versioned_docs/version-0.4.x/benchmarks/model-size.md @@ -25,23 +25,23 @@ title: Model Size ## OCR -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. ## Vertical OCR -| Model | XNNPACK [MB] | -| ------------------------ | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | -| Recognizer (CRNN_EN_512) | 15 - 18\* | -| Recognizer (CRNN_EN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_320_QUANTIZED) | 19.8 | +| Recognizer (CRNN_EN_512) | 15 - 18\* | +| Recognizer (CRNN_EN_64) | 15 - 16\* | \* - The model weights vary depending on the language. From b9de78e5d01ea2999a1e09bae1d9a1e208375b4f Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Mon, 3 Nov 2025 08:31:02 +0100 Subject: [PATCH 03/11] chore: minor naming fixes --- docs/docs/04-benchmarks/model-size.md | 24 +++++++++---------- .../version-0.5.x/04-benchmarks/model-size.md | 24 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/docs/04-benchmarks/model-size.md b/docs/docs/04-benchmarks/model-size.md index 30999fa7b..128cbd7fb 100644 --- a/docs/docs/04-benchmarks/model-size.md +++ b/docs/docs/04-benchmarks/model-size.md @@ -25,23 +25,23 @@ title: Model Size ## OCR -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 19.8 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. ## Vertical OCR -| Model | XNNPACK [MB] | -| ------------------------ | :----------: | -| Detector (CRAFT_1280) | 19.8 | -| Detector (CRAFT_320) | 19.8 | -| Recognizer (CRNN_EN_512) | 15 - 18\* | -| Recognizer (CRNN_EN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_320_QUANTIZED) | 19.8 | +| Recognizer (CRNN_EN_512) | 15 - 18\* | +| Recognizer (CRNN_EN_64) | 15 - 16\* | \* - The model weights vary depending on the language. 
diff --git a/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md b/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md index 30999fa7b..128cbd7fb 100644 --- a/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md +++ b/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md @@ -25,23 +25,23 @@ title: Model Size ## OCR -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 19.8 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. ## Vertical OCR -| Model | XNNPACK [MB] | -| ------------------------ | :----------: | -| Detector (CRAFT_1280) | 19.8 | -| Detector (CRAFT_320) | 19.8 | -| Recognizer (CRNN_EN_512) | 15 - 18\* | -| Recognizer (CRNN_EN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_320_QUANTIZED) | 19.8 | +| Recognizer (CRNN_EN_512) | 15 - 18\* | +| Recognizer (CRNN_EN_64) | 15 - 16\* | \* - The model weights vary depending on the language. From bbd7f530d6e91f4aedf2c7af3bd69f9b16dcfaa7 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Mon, 17 Nov 2025 10:46:25 +0100 Subject: [PATCH 04/11] chore: update docs subsections --- .../useSpeechToText.md | 30 ++-- .../useTextEmbeddings.md | 24 ++-- .../02-computer-vision/useClassification.md | 6 +- .../02-computer-vision/useImageEmbeddings.md | 6 +- .../02-hooks/02-computer-vision/useOCR.md | 46 +++--- .../02-computer-vision/useObjectDetection.md | 6 +- .../02-computer-vision/useStyleTransfer.md | 18 +-- .../02-computer-vision/useTextToImage.md | 6 +- .../02-computer-vision/useVerticalOCR.md | 44 +++--- .../computer-vision/useClassification.md | 8 +- .../version-0.4.x/computer-vision/useOCR.md | 14 +- .../computer-vision/useObjectDetection.md | 4 +- .../computer-vision/useStyleTransfer.md | 10 +- .../computer-vision/useVerticalOCR.md | 14 +- .../useTextEmbeddings.md | 12 +- .../useSpeechToText.md | 30 ++-- .../useTextEmbeddings.md | 24 ++-- .../02-computer-vision/useClassification.md | 6 +- .../02-computer-vision/useImageEmbeddings.md | 10 +- .../02-hooks/02-computer-vision/useOCR.md | 60 ++++---- .../02-computer-vision/useObjectDetection.md | 6 +- .../02-computer-vision/useStyleTransfer.md | 18 +-- .../02-computer-vision/useTextToImage.md | 133 ++++++++++++++++++ .../02-computer-vision/useVerticalOCR.md | 58 ++++---- 24 files changed, 357 insertions(+), 236 deletions(-) create mode 100644 docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useTextToImage.md diff --git a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md index 8876bf37e..d94c96a66 100644 --- a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md @@ -75,20 +75,20 @@ For more information on loading resources, take a look at [loading models](../.. 
### Returns -| Field | Type | Description | -| --------------------------- | ---------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | -| `stream` | `(options?: DecodingOptions \| undefined) => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | -| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | -| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | -| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | -| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | -| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | -| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | -| `error` | `string \| null` | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Tracks the progress of the model download process. | +| Field | Type | Description | +| --------------------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. 
Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | +| `stream` | `(options?: DecodingOptions \| undefined) => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | +| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | +| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | +| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | +| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | +| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | +| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | +| `error` | `string \| null` | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Tracks the progress of the model download process. |
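+
+Below is a minimal sketch of how the streaming pieces above fit together. Only the calls on the hook come from the API in this table; the model constant follows the pattern used elsewhere in these docs, and `onAudioChunk` stands in for whatever audio-capture integration you use to obtain 16 kHz `Float32Array` chunks.
+
+```tsx
+import { useSpeechToText, WHISPER_TINY } from 'react-native-executorch';
+import { Button, Text, View } from 'react-native';
+
+function StreamingTranscription() {
+  const stt = useSpeechToText({ model: WHISPER_TINY });
+
+  // Start a streaming session; pass e.g. { language: 'es' } for multilingual models.
+  const start = () => {
+    stt.stream();
+  };
+
+  // Wire this up to your own audio capture code (a placeholder, not part of the library);
+  // every chunk must be a waveform sampled at 16 kHz.
+  const onAudioChunk = (chunk: Float32Array) => {
+    stt.streamInsert(chunk);
+  };
+
+  // End the session once recording is finished.
+  const stop = () => {
+    stt.streamStop();
+  };
+
+  return (
+    <View>
+      <Button title="Start" onPress={start} />
+      <Button title="Stop" onPress={stop} />
+      {/* Finalized text first, then the part that may still change */}
+      <Text>
+        {stt.committedTranscription} {stt.nonCommittedTranscription}
+      </Text>
+    </View>
+  );
+}
+```
+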
Type definitions @@ -340,4 +340,4 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | -| WHISPER_TINY | 900 | 600 | +| WHISPER_TINY | 410 | 375 | diff --git a/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md b/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md index c40d19e94..fd595d208 100644 --- a/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md +++ b/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md @@ -133,11 +133,11 @@ For the supported models, the returned embedding vector is normalized, meaning t | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | -| ALL_MINILM_L6_V2 | 85 | 100 | -| ALL_MPNET_BASE_V2 | 390 | 465 | -| MULTI_QA_MINILM_L6_COS_V1 | 115 | 130 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 415 | 490 | -| CLIP_VIT_BASE_PATCH32_TEXT | 195 | 250 | +| ALL_MINILM_L6_V2 | 95 | 110 | +| ALL_MPNET_BASE_V2 | 405 | 455 | +| MULTI_QA_MINILM_L6_COS_V1 | 120 | 140 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 435 | 455 | +| CLIP_VIT_BASE_PATCH32_TEXT | 200 | 280 | ### Inference time @@ -145,13 +145,13 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 15 | 22 | 23 | 36 | 31 | -| ALL_MPNET_BASE_V2 | 71 | 96 | 101 | 112 | 105 | -| MULTI_QA_MINILM_L6_COS_V1 | 15 | 22 | 23 | 36 | 31 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 71 | 95 | 100 | 112 | 105 | -| CLIP_VIT_BASE_PATCH32_TEXT | 31 | 47 | 48 | 55 | 49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 16 | 16 | 19 | 54 | 28 | +| ALL_MPNET_BASE_V2 | 115 | 116 | 144 | 145 | 95 | +| MULTI_QA_MINILM_L6_COS_V1 | 16 | 16 | 20 | 47 | 28 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 112 | 119 | 144 | 146 | 96 | +| CLIP_VIT_BASE_PATCH32_TEXT | 47 | 45 | 57 | 65 | 48 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. 
diff --git a/docs/docs/02-hooks/02-computer-vision/useClassification.md b/docs/docs/02-hooks/02-computer-vision/useClassification.md index b4d3f34a6..e17bfa775 100644 --- a/docs/docs/02-hooks/02-computer-vision/useClassification.md +++ b/docs/docs/02-hooks/02-computer-vision/useClassification.md @@ -100,7 +100,7 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 130 | 85 | +| EFFICIENTNET_V2_S | 230 | 87 | ### Inference time @@ -108,6 +108,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 105 | 110 | 149 | 299 | 227 | diff --git a/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md b/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md index 6dbdc7dcc..4d417590c 100644 --- a/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md +++ b/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md @@ -123,9 +123,9 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. Performance also heavily depends on image size, because resize is expansive operation, especially on low-end devices. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 48 | 64 | 69 | 65 | 63 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 70 | 70 | 90 | 66 | 58 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. 
diff --git a/docs/docs/02-hooks/02-computer-vision/useOCR.md b/docs/docs/02-hooks/02-computer-vision/useOCR.md index 037daebf7..08e28f829 100644 --- a/docs/docs/02-hooks/02-computer-vision/useOCR.md +++ b/docs/docs/02-hooks/02-computer-vision/useOCR.md @@ -288,20 +288,20 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc ### Model size -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. ### Memory usage -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1600 | 1700 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ### Inference time @@ -317,18 +317,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini **Time measurements:** -| Metric | iPhone 14 Pro Max
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | -| ------------------------- | ----------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 4330 | 2537 | ❌ | 6648 | 5993 | -| **Detector (CRAFT_800)** | 1945 | 1809 | ❌ | 2080 | 1961 | -| **Recognizer (CRNN_512)** | | | | | | -| ├─ Average Time | 273 | 76 | ❌ | 289 | 252 | -| ├─ Total Time (3 runs) | 820 | 229 | ❌ | 867 | 756 | -| **Recognizer (CRNN_256)** | | | | | | -| ├─ Average Time | 137 | 39 | ❌ | 260 | 229 | -| ├─ Total Time (7 runs) | 958 | 271 | ❌ | 1818 | 1601 | -| **Recognizer (CRNN_128)** | | | | | | -| ├─ Average Time | 68 | 18 | ❌ | 239 | 214 | -| ├─ Total Time (7 runs) | 478 | 124 | ❌ | 1673 | 1498 | - -❌ - Insufficient RAM. +| Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3
[ms] | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | +| ---------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | +| **Total Inference Time** | 1160 | 1144 | 1498 | 1567 | 1160 | +| **Detector (CRAFT_800_QUANTIZED)** | 669 | 649 | 825 | 541 | 474 | +| **Recognizer (CRNN_512)** | | | | | | +| ├─ Average Time | 48 | 47 | 60 | 91 | 72 | +| ├─ Total Time (3 runs) | 144 | 141 | 180 | 273 | 216 | +| **Recognizer (CRNN_256)** | | | | | | +| ├─ Average Time | 22 | 22 | 29 | 51 | 30 | +| ├─ Total Time (7 runs) | 154 | 154 | 203 | 357 | 210 | +| **Recognizer (CRNN_128)** | | | | | | +| ├─ Average Time | 11 | 11 | 14 | 28 | 17 | +| ├─ Total Time (7 runs) | 77 | 77 | 98 | 196 | 119 | diff --git a/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md b/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md index ac756d6a6..7f49e8389 100644 --- a/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md +++ b/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md @@ -139,7 +139,7 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 90 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | ### Inference time @@ -147,6 +147,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 116 | 120 | 164 | 257 | 129 | diff --git a/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md b/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md index 899a619ca..2bedba325 100644 --- a/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md +++ b/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md @@ -95,10 +95,10 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 950 | 350 | -| STYLE_TRANSFER_MOSAIC | 950 | 350 | -| STYLE_TRANSFER_UDNIE | 950 | 350 | -| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 | +| STYLE_TRANSFER_CANDY | 1200 | 380 | +| STYLE_TRANSFER_MOSAIC | 1200 | 380 | +| STYLE_TRANSFER_UDNIE | 1200 | 380 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | ### Inference time @@ -106,9 +106,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1356 | 1550 | 2003 | 2578 | 2328 | +| STYLE_TRANSFER_MOSAIC | 1376 | 1456 | 1971 | 2657 | 2394 | +| STYLE_TRANSFER_UDNIE | 1389 | 1499 | 1858 | 2380 | 2124 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1339 | 1514 | 2004 | 2608 | 2371 | diff --git a/docs/docs/02-hooks/02-computer-vision/useTextToImage.md b/docs/docs/02-hooks/02-computer-vision/useTextToImage.md index 83e47a3e2..3eaf7d826 100644 --- a/docs/docs/02-hooks/02-computer-vision/useTextToImage.md +++ b/docs/docs/02-hooks/02-computer-vision/useTextToImage.md @@ -124,9 +124,9 @@ The number following the underscore (\_) indicates that the model supports gener ### Inference time -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :-------------------: | :-------------------------------: | :-----------------------: | -| BK_SDM_TINY_VPRED_256 | 19100 | 25000 | ❌ | ❌ | 23100 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| BK_SDM_TINY_VPRED_256 | 21184 | 21021 | ❌ | 18834 | 16617 | :::info Text-to-image benchmark times are measured generating 256×256 images in 10 inference steps. diff --git a/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md b/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md index 29a4de452..94e5e3054 100644 --- a/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md +++ b/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md @@ -302,12 +302,12 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc ### Model size -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_32_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_64) | 15 - 16\* | \* - The model weights vary depending on the language. 
@@ -315,8 +315,8 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2172 | 2214 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1774 | 1705 | +| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1070 | 1000 | ### Inference time @@ -332,18 +332,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini **Time measurements:** -| Metric | iPhone 14 Pro Max
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | -| -------------------------------------------------------------------------- | ----------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 9350 / 9620 | 8572 / 8621 | ❌ | 13737 / 10570 | 13436 / 9848 | -| **Detector (CRAFT_1250)** | 4895 | 4756 | ❌ | 5574 | 5016 | -| **Detector (CRAFT_320)** | | | | | | -| ├─ Average Time | 1247 | 1206 | ❌ | 1350 | 1356 | -| ├─ Total Time (3 runs) | 3741 | 3617 | ❌ | 4050 | 4069 | -| **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | -| ├─ Average Time | 31 | 9 | ❌ | 195 | 207 | -| ├─ Total Time (21 runs) | 649 | 191 | ❌ | 4092 | 4339 | -| **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | -| ├─ Average Time | 306 | 80 | ❌ | 308 | 250 | -| ├─ Total Time (3 runs) | 919 | 240 | ❌ | 925 | 751 | - -❌ - Insufficient RAM. +| Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3
[ms] | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | +| -------------------------------------------------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | +| **Total Inference Time** | 3819 / 3716 | 3978 / 3841 | 4751 / 4532 | 3095 / 3286 | 2787 / 2770 | +| **Detector (CRAFT_1280_QUANTIZED)** | 1749 | 1804 | 2105 | 1216 | 1171 | +| **Detector (CRAFT_320_QUANTIZED)** | | | | | | +| ├─ Average Time | 458 | 474 | 561 | 360 | 332 | +| ├─ Total Time (4 runs) | 1832 | 1896 | 2244 | 1440 | 1328 | +| **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | +| ├─ Average Time | 5 | 6 | 7 | 28 | 11 | +| ├─ Total Time (21 runs) | 105 | 126 | 147 | 588 | 231 | +| **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | +| ├─ Average Time | 54 | 52 | 68 | 144 | 72 | +| ├─ Total Time (4 runs) | 216 | 208 | 272 | 576 | 288 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useClassification.md b/docs/versioned_docs/version-0.4.x/computer-vision/useClassification.md index fb812fb57..caef31b3d 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useClassification.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useClassification.md @@ -85,8 +85,8 @@ function App() { ## Supported models -| Model | Number of classes | Class list | -| --------------------------------------------------------------------------------------------------------------- | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Model | Number of classes | Class list | +| ----------------------------------------------------------------------------------------------------------------- | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [efficientnet_v2_s](https://pytorch.org/vision/stable/models/generated/torchvision.models.efficientnet_v2_s.html) | 1000 | [ImageNet1k_v1](https://github.com/software-mansion/react-native-executorch/blob/release/0.4/android/src/main/java/com/swmansion/rnexecutorch/models/classification/Constants.kt) | ## Benchmarks @@ -109,6 +109,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 150 | 161 | 227 | 196 | 214 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useOCR.md b/docs/versioned_docs/version-0.4.x/computer-vision/useOCR.md index 2c12300e8..960815719 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useOCR.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useOCR.md @@ -321,11 +321,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_800) | 2099 | 2227 | ❌ | 2245 | 7108 | -| Recognizer (CRNN_512) | 70 | 252 | ❌ | 54 | 151 | -| Recognizer (CRNN_256) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_128) | 17 | 83 | ❌ | 14 | 39 | - -❌ - Insufficient RAM. 
+| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_800_QUANTIZED) | 779 | 897 | 1276 | 553 | 586 | +| Recognizer (CRNN_512) | 77 | 74 | 244 | 56 | 57 | +| Recognizer (CRNN_256) | 35 | 37 | 120 | 28 | 30 | +| Recognizer (CRNN_128) | 18 | 19 | 60 | 14 | 16 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useObjectDetection.md b/docs/versioned_docs/version-0.4.x/computer-vision/useObjectDetection.md index b18faa8f8..0bdbeef01 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useObjectDetection.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useObjectDetection.md @@ -145,6 +145,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 261 | 279 | 414 | 125 | 115 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useStyleTransfer.md b/docs/versioned_docs/version-0.4.x/computer-vision/useStyleTransfer.md index 40f30a1d0..09599bac7 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useStyleTransfer.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useStyleTransfer.md @@ -107,9 +107,9 @@ function App(){ Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_MOSAIC | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_UDNIE | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1565 | 1675 | 2325 | 1750 | 1620 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useVerticalOCR.md b/docs/versioned_docs/version-0.4.x/computer-vision/useVerticalOCR.md index 98cc301bf..ce9b456e3 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useVerticalOCR.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useVerticalOCR.md @@ -342,11 +342,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_1280) | 5457 | 5833 | ❌ | 6296 | 14053 | -| Detector (CRAFT_320) | 1351 | 1460 | ❌ | 1485 | 3101 | -| Recognizer (CRNN_512) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_64) | 10 | 33 | ❌ | 7 | 18 | - -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_1280_QUANTIZED) | 1918 | 2304 | 3371 | 1391 | 1445 | +| Detector (CRAFT_320_QUANTIZED) | 473 | 563 | 813 | 361 | 382 | +| Recognizer (CRNN_512) | 78 | 83 | 310 | 59 | 57 | +| Recognizer (CRNN_64) | 9 | 9 | 38 | 8 | 7 | diff --git a/docs/versioned_docs/version-0.4.x/natural-language-processing/useTextEmbeddings.md b/docs/versioned_docs/version-0.4.x/natural-language-processing/useTextEmbeddings.md index 43fefe3d6..5aeeaa02b 100644 --- a/docs/versioned_docs/version-0.4.x/natural-language-processing/useTextEmbeddings.md +++ b/docs/versioned_docs/version-0.4.x/natural-language-processing/useTextEmbeddings.md @@ -148,9 +148,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 53 | 69 | 78 | 60 | 65 | -| ALL_MPNET_BASE_V2 | 352 | 423 | 478 | 521 | 527 | -| MULTI_QA_MINILM_L6_COS_V1 | 135 | 166 | 180 | 158 | 165 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 503 | 598 | 680 | 694 | 743 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 50 | 58 | 84 | 58 | 58 | +| ALL_MPNET_BASE_V2 | 352 | 428 | 879 | 483 | 517 | +| MULTI_QA_MINILM_L6_COS_V1 | 133 | 161 | 269 | 151 | 155 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 502 | 796 | 1216 | 915 | 713 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useSpeechToText.md b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useSpeechToText.md index 3256e2e88..d94c96a66 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useSpeechToText.md @@ -75,20 +75,20 @@ For more information on loading resources, take a look at [loading models](../.. ### Returns -| Field | Type | Description | -| --------------------------- | ---------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | -| `stream` | `() => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | -| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | -| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | -| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | -| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. 
| -| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | -| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | -| `error` | `string \| null` | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Tracks the progress of the model download process. | +| Field | Type | Description | +| --------------------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | +| `stream` | `(options?: DecodingOptions \| undefined) => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | +| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | +| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | +| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | +| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | +| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | +| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | +| `error` | `string \| null` | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Tracks the progress of the model download process. |
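+
+Both entry points accept the same `DecodingOptions` object, so a language hint can be passed to either. A short sketch – `stt` stands for the object returned by the hook and `waveform` for a 16 kHz `Float32Array` you have already recorded (both are assumptions of this example):
+
+```typescript
+// One-shot transcription with a language hint (multilingual models only).
+const text = await stt.transcribe(waveform, { language: 'es' });
+
+// The streaming entry point takes the same options object.
+stt.stream({ language: 'es' });
+stt.streamInsert(waveform); // feed 16 kHz chunks as they arrive
+stt.streamStop();
+```
+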
Type definitions @@ -340,4 +340,4 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | -| WHISPER_TINY | 900 | 600 | +| WHISPER_TINY | 410 | 375 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md index c40d19e94..fd595d208 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md @@ -133,11 +133,11 @@ For the supported models, the returned embedding vector is normalized, meaning t | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | -| ALL_MINILM_L6_V2 | 85 | 100 | -| ALL_MPNET_BASE_V2 | 390 | 465 | -| MULTI_QA_MINILM_L6_COS_V1 | 115 | 130 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 415 | 490 | -| CLIP_VIT_BASE_PATCH32_TEXT | 195 | 250 | +| ALL_MINILM_L6_V2 | 95 | 110 | +| ALL_MPNET_BASE_V2 | 405 | 455 | +| MULTI_QA_MINILM_L6_COS_V1 | 120 | 140 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 435 | 455 | +| CLIP_VIT_BASE_PATCH32_TEXT | 200 | 280 | ### Inference time @@ -145,13 +145,13 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 15 | 22 | 23 | 36 | 31 | -| ALL_MPNET_BASE_V2 | 71 | 96 | 101 | 112 | 105 | -| MULTI_QA_MINILM_L6_COS_V1 | 15 | 22 | 23 | 36 | 31 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 71 | 95 | 100 | 112 | 105 | -| CLIP_VIT_BASE_PATCH32_TEXT | 31 | 47 | 48 | 55 | 49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 16 | 16 | 19 | 54 | 28 | +| ALL_MPNET_BASE_V2 | 115 | 116 | 144 | 145 | 95 | +| MULTI_QA_MINILM_L6_COS_V1 | 16 | 16 | 20 | 47 | 28 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 112 | 119 | 144 | 146 | 96 | +| CLIP_VIT_BASE_PATCH32_TEXT | 47 | 45 | 57 | 65 | 48 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. 
diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useClassification.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useClassification.md index b4d3f34a6..e17bfa775 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useClassification.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useClassification.md @@ -100,7 +100,7 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 130 | 85 | +| EFFICIENTNET_V2_S | 230 | 87 | ### Inference time @@ -108,6 +108,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 105 | 110 | 149 | 299 | 227 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useImageEmbeddings.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useImageEmbeddings.md index 1849a95ce..4d417590c 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useImageEmbeddings.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useImageEmbeddings.md @@ -91,9 +91,9 @@ try { ## Supported models -| Model | Language | Image size | Embedding Dimensions | Description | +| Model | Language | Image size | Embedding dimensions | Description | | ---------------------------------------------------------------------------------- | :------: | :--------: | :------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [clip-vit-base-patch32-image](https://huggingface.co/openai/clip-vit-base-patch32) | English | 224 x 224 | 512 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. CLIP allows to embed images and text into the same vector space. This allows to find similar images as well as to implement image search. This is the image encoder part of the CLIP model. To embed text checkout [clip-vit-base-patch32-text](../01-natural-language-processing/useTextEmbeddings.md#supported-models). | +| [clip-vit-base-patch32-image](https://huggingface.co/openai/clip-vit-base-patch32) | English | 224×224 | 512 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. CLIP allows to embed images and text into the same vector space. This allows to find similar images as well as to implement image search. 
This is the image encoder part of the CLIP model. To embed text checkout [clip-vit-base-patch32-text](../01-natural-language-processing/useTextEmbeddings.md#supported-models). | **`Image size`** - the size of an image that the model takes as an input. Resize will happen automatically. @@ -123,9 +123,9 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. Performance also heavily depends on image size, because resize is expansive operation, especially on low-end devices. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 48 | 64 | 69 | 65 | 63 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 70 | 70 | 90 | 66 | 58 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useOCR.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useOCR.md index a23acd17c..5a1e80cfc 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useOCR.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useOCR.md @@ -134,13 +134,13 @@ For more information on loading resources, take a look at [loading models](../.. The hook returns an object with the following properties: -| Field | Type | Description | -| ------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. | -| `error` | string | null | Contains the error message if the model loading failed. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. | +| Field | Type | Description | +| ------------------ | -------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| `forward` | `(imageSource: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. | +| `error` | string | null | Contains the error message if the model loading failed. 
| +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. | ## Running the model @@ -288,20 +288,20 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc ### Model size -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. ### Memory usage -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1600 | 1700 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ### Inference time @@ -317,18 +317,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini **Time measurements:** -| Metric | iPhone 14 Pro Max
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | -| ------------------------- | ----------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 4330 | 2537 | ❌ | 6648 | 5993 | -| **Detector (CRAFT_800)** | 1945 | 1809 | ❌ | 2080 | 1961 | -| **Recognizer (CRNN_512)** | | | | | | -| ├─ Average Time | 273 | 76 | ❌ | 289 | 252 | -| ├─ Total Time (3 runs) | 820 | 229 | ❌ | 867 | 756 | -| **Recognizer (CRNN_256)** | | | | | | -| ├─ Average Time | 137 | 39 | ❌ | 260 | 229 | -| ├─ Total Time (7 runs) | 958 | 271 | ❌ | 1818 | 1601 | -| **Recognizer (CRNN_128)** | | | | | | -| ├─ Average Time | 68 | 18 | ❌ | 239 | 214 | -| ├─ Total Time (7 runs) | 478 | 124 | ❌ | 1673 | 1498 | - -❌ - Insufficient RAM. +| Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3
[ms] | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | +| ---------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | +| **Total Inference Time** | 1160 | 1144 | 1498 | 1567 | 1160 | +| **Detector (CRAFT_800_QUANTIZED)** | 669 | 649 | 825 | 541 | 474 | +| **Recognizer (CRNN_512)** | | | | | | +| ├─ Average Time | 48 | 47 | 60 | 91 | 72 | +| ├─ Total Time (3 runs) | 144 | 141 | 180 | 273 | 216 | +| **Recognizer (CRNN_256)** | | | | | | +| ├─ Average Time | 22 | 22 | 29 | 51 | 30 | +| ├─ Total Time (7 runs) | 154 | 154 | 203 | 357 | 210 | +| **Recognizer (CRNN_128)** | | | | | | +| ├─ Average Time | 11 | 11 | 14 | 28 | 17 | +| ├─ Total Time (7 runs) | 77 | 77 | 98 | 196 | 119 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useObjectDetection.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useObjectDetection.md index ac756d6a6..7f49e8389 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useObjectDetection.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useObjectDetection.md @@ -139,7 +139,7 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 90 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | ### Inference time @@ -147,6 +147,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 116 | 120 | 164 | 257 | 129 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useStyleTransfer.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useStyleTransfer.md index 899a619ca..2bedba325 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useStyleTransfer.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useStyleTransfer.md @@ -95,10 +95,10 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 950 | 350 | -| STYLE_TRANSFER_MOSAIC | 950 | 350 | -| STYLE_TRANSFER_UDNIE | 950 | 350 | -| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 | +| STYLE_TRANSFER_CANDY | 1200 | 380 | +| STYLE_TRANSFER_MOSAIC | 1200 | 380 | +| STYLE_TRANSFER_UDNIE | 1200 | 380 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | ### Inference time @@ -106,9 +106,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1356 | 1550 | 2003 | 2578 | 2328 | +| STYLE_TRANSFER_MOSAIC | 1376 | 1456 | 1971 | 2657 | 2394 | +| STYLE_TRANSFER_UDNIE | 1389 | 1499 | 1858 | 2380 | 2124 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1339 | 1514 | 2004 | 2608 | 2371 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useTextToImage.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useTextToImage.md new file mode 100644 index 000000000..476f8d95d --- /dev/null +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useTextToImage.md @@ -0,0 +1,133 @@ +--- +title: useTextToImage +keywords: [image generation] +description: "Learn how to use image generation models in your React Native applications with React Native ExecuTorch's useTextToImage hook." +--- + +Text-to-image is a process of generating images directly from a description in natural language by conditioning a model on the provided text input. Our implementation follows the Stable Diffusion pipeline, which applies the diffusion process in a lower-dimensional latent space to reduce memory requirements. The pipeline combines a text encoder to preprocess the prompt, a U-Net that iteratively denoises latent representations, and a VAE decoder to reconstruct the final image. React Native ExecuTorch offers a dedicated hook, `useTextToImage`, for this task. + + + +:::warning +It is recommended to use models provided by us which are available at our Hugging Face repository, you can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. +::: + +## Reference + +```typescript +import { useTextToImage, BK_SDM_TINY_VPRED_256 } from 'react-native-executorch'; + +const model = useTextToImage({ model: BK_SDM_TINY_VPRED_256 }); + +const input = 'a castle'; + +try { + const image = await model.generate(input); +} catch (error) { + console.error(error); +} +``` + +### Arguments + +**`model`** - Object containing the model source. + +- **`schedulerSource`** - A string that specifies the location of the scheduler config. + +- **`tokenizerSource`** - A string that specifies the location of the tokenizer config. + +- **`encoderSource`** - A string that specifies the location of the text encoder binary. + +- **`unetSource`** - A string that specifies the location of the U-Net binary. + +- **`decoderSource`** - A string that specifies the location of the VAE decoder binary. + +**`preventLoad?`** - Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. 
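+
+If you host the exported files yourself, the `model` object can be assembled from the individual sources listed above instead of the bundled constant. A minimal sketch – every URL below is a placeholder for your own hosting location, not an official artifact path:
+
+```typescript
+import { useTextToImage } from 'react-native-executorch';
+
+const model = useTextToImage({
+  model: {
+    // Placeholder URLs – replace with the locations of your exported files.
+    schedulerSource: 'https://example.com/bk-sdm-tiny/scheduler_config.json',
+    tokenizerSource: 'https://example.com/bk-sdm-tiny/tokenizer.json',
+    encoderSource: 'https://example.com/bk-sdm-tiny/text_encoder.pte',
+    unetSource: 'https://example.com/bk-sdm-tiny/unet.pte',
+    decoderSource: 'https://example.com/bk-sdm-tiny/vae_decoder.pte',
+  },
+  // Postpone downloading and loading until you trigger it yourself.
+  preventLoad: true,
+});
+```
+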
+ +For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. + +### Returns + +| Field | Type | Description | +| ------------------ | ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `generate` | `(input: string, imageSize?: number, numSteps?: number, seed?: number) => Promise` | Runs the model to generate an image described by `input`, and conditioned by `seed`, performing `numSteps` inference steps. The resulting image, with dimensions `imageSize`×`imageSize` pixels, is returned as a base64-encoded string. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. | +| `interrupt()` | `() => void` | Interrupts the current inference. The model is stopped in the nearest inference step. | + +## Running the model + +To run the model, you can use the `forward` method. It accepts four arguments: a text prompt describing the requested image, a size of the image in pixels, a number of denoising steps, and an optional seed value, which enables reproducibility of the results. + +The image size must be a multiple of 32 due to the architecture of the U-Net and VAE models. The seed should be a positive integer. + +:::warning +Larger imageSize values require significantly more memory to run the model. +::: + +## Example + +```tsx +import { useTextToImage, BK_SDM_TINY_VPRED_256 } from 'react-native-executorch'; + +function App() { + const model = useTextToImage({ model: BK_SDM_TINY_VPRED_256 }); + + //... + const input = 'a medieval castle by the sea shore'; + + const imageSize = 256; + const numSteps = 25; + + try { + image = await model.generate(input, imageSize, numSteps); + } catch (error) { + console.error(error); + } + //... + + return ; +} +``` + +| ![Castle 256x256](../../../../static/img/castle256.png) | ![Castle 512x512](../../../../static/img/castle512.png) | +| ------------------------------------------------------- | ------------------------------------------------------- | +| Image of size 256×256 | Image of size 512×512 | + +## Supported models + +| Model | Parameters [B] | Description | +| ------------------------------------------------------------------- | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [bk-sdm-tiny-vpred](https://huggingface.co/vivym/bk-sdm-tiny-vpred) | 0.5 | BK-SDM (Block-removed Knowledge-distilled Stable Diffusion Model) is a compressed version of Stable Diffusion v1.4 with several residual and attention blocks removed. The BK-SDM-Tiny is a v-prediction variant of the model, obtained through further block removal, built around a 0.33B-parameter U-Net. 
| + +## Benchmarks + +:::info +The number following the underscore (\_) indicates that the model supports generating image with dimensions ranging from 128 pixels up to that value. This setting doesn’t affect the model’s file size - it only determines how memory is allocated at runtime, based on the maximum allowed image size. +::: + +### Model size + +| Model | Text encoder (XNNPACK) [MB] | UNet (XNNPACK) [MB] | VAE decoder (XNNPACK) [MB] | +| --------------------- | --------------------------- | ------------------- | -------------------------- | +| BK_SDM_TINY_VPRED_256 | 492 | 1290 | 198 | +| BK_SDM_TINY_VPRED_512 | 492 | 1290 | 198 | + +### Memory usage + +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| --------------------- | ---------------------- | ------------------ | +| BK_SDM_TINY_VPRED_256 | 2900 | 2800 | +| BK_SDM_TINY_VPRED_512 | 6700 | 6560 | + +### Inference time + +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| BK_SDM_TINY_VPRED_256 | 21184 | 21021 | ❌ | 18834 | 16617 | + +:::info +Text-to-image benchmark times are measured generating 256×256 images in 10 inference steps. +::: diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useVerticalOCR.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useVerticalOCR.md index e15c08fbe..73c3fc108 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useVerticalOCR.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useVerticalOCR.md @@ -147,13 +147,13 @@ For more information on loading resources, take a look at [loading models](../.. The hook returns an object with the following properties: -| Field | Type | Description | -| ------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. | -| `error` | string | null | Contains the error message if the model loading failed. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. | +| Field | Type | Description | +| ------------------ | -------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| `forward` | `(imageSource: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. | +| `error` | string | null | Contains the error message if the model loading failed. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. 
| ## Running the model @@ -302,12 +302,12 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc ### Model size -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_32_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_64) | 15 - 16\* | \* - The model weights vary depending on the language. @@ -315,8 +315,8 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2172 | 2214 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1774 | 1705 | +| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1070 | 1000 | ### Inference time @@ -332,18 +332,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini **Time measurements:** -| Metric | iPhone 14 Pro Max
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | -| -------------------------------------------------------------------------- | ----------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 9350 / 9620 | 8572 / 8621 | ❌ | 13737 / 10570 | 13436 / 9848 | -| **Detector (CRAFT_1250)** | 4895 | 4756 | ❌ | 5574 | 5016 | -| **Detector (CRAFT_320)** | | | | | | -| ├─ Average Time | 1247 | 1206 | ❌ | 1350 | 1356 | -| ├─ Total Time (3 runs) | 3741 | 3617 | ❌ | 4050 | 4069 | -| **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | -| ├─ Average Time | 31 | 9 | ❌ | 195 | 207 | -| ├─ Total Time (21 runs) | 649 | 191 | ❌ | 4092 | 4339 | -| **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | -| ├─ Average Time | 306 | 80 | ❌ | 308 | 250 | -| ├─ Total Time (3 runs) | 919 | 240 | ❌ | 925 | 751 | - -❌ - Insufficient RAM. +| Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | +| -------------------------------------------------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | +| **Total Inference Time** | 3819 / 3716 | 3978 / 3841 | 4751 / 4532 | 3095 / 3286 | 2787 / 2770 | +| **Detector (CRAFT_1280_QUANTIZED)** | 1749 | 1804 | 2105 | 1216 | 1171 | +| **Detector (CRAFT_320_QUANTIZED)** | | | | | | +| ├─ Average Time | 458 | 474 | 561 | 360 | 332 | +| ├─ Total Time (4 runs) | 1832 | 1896 | 2244 | 1440 | 1328 | +| **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | +| ├─ Average Time | 5 | 6 | 7 | 28 | 11 | +| ├─ Total Time (21 runs) | 105 | 126 | 147 | 588 | 231 | +| **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | +| ├─ Average Time | 54 | 52 | 68 | 144 | 72 | +| ├─ Total Time (4 runs) | 216 | 208 | 272 | 576 | 288 | From 2d1ac228d6ff5eea8b82489c0c8f2a74906ef3c0 Mon Sep 17 00:00:00 2001 From: Mateusz Kopcinski <120639731+mkopcins@users.noreply.github.com> Date: Wed, 3 Dec 2025 11:20:11 +0100 Subject: [PATCH 05/11] feat: Remove stft calculation within the encoder (#658) ## Description The Whisper model export now takes in a plain waveform instead of pre-computed STFT. This PR aims to change the current API to accept waveforms instead. Before merging this, make sure to re-export all the existing Whisper models with the new export script. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [ ] Bug fix (change which fixes an issue) - [x] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) ### Tested on - [x] iOS - [x] Android ### Testing instructions ### Screenshots ### Related issues ### Checklist - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes --------- Co-authored-by: chmjkb Co-authored-by: Jakub Chmura <92989966+chmjkb@users.noreply.github.com> Co-authored-by: IgorSwat <114943112+IgorSwat@users.noreply.github.com> --- .../rnexecutorch/data_processing/dsp.cpp | 46 ------------------- .../common/rnexecutorch/models/BaseModel.cpp | 12 ++--- .../common/rnexecutorch/models/BaseModel.h | 17 ++++--- .../models/speech_to_text/asr/ASR.cpp | 31 +++++-------- .../models/speech_to_text/asr/ASR.h | 9 ++-- .../VoiceActivityDetection.cpp | 3 +- .../src/constants/modelUrls.ts | 46 +++++++++++-------- 7 files changed, 61 insertions(+), 103 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/data_processing/dsp.cpp b/packages/react-native-executorch/common/rnexecutorch/data_processing/dsp.cpp index d3761dced..b1c8714a2 100644 --- a/packages/react-native-executorch/common/rnexecutorch/data_processing/dsp.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/data_processing/dsp.cpp @@ -1,6 +1,4 @@ -#include #include -#include #include #include #include @@ -18,48 +16,4 @@ std::vector hannWindow(size_t size) { return window; } -std::vector stftFromWaveform(std::span waveform, - size_t fftWindowSize, size_t hopSize) { - // Initialize FFT - FFT fft(fftWindowSize); - - const auto numFrames = 1 + (waveform.size() - fftWindowSize) / hopSize; - const auto numBins = fftWindowSize / 2; - const auto hann = hannWindow(fftWindowSize); - auto inBuffer = std::vector(fftWindowSize); - auto outBuffer = std::vector>(fftWindowSize); - - // Output magnitudes in dB - std::vector magnitudes; - magnitudes.reserve(numFrames * numBins); - const auto magnitudeScale = 1.0f / static_cast(fftWindowSize); - constexpr auto epsilon = std::numeric_limits::epsilon(); - constexpr auto dbConversionFactor = 20.0f; - - for (size_t t = 0; t < numFrames; ++t) { - const size_t offset = t * hopSize; - // Clear the input buffer first - std::ranges::fill(inBuffer, 0.0f); - - // Fill frame with windowed signal - const size_t samplesToRead = - std::min(fftWindowSize, waveform.size() - offset); - for (size_t i = 0; i < samplesToRead; i++) { - inBuffer[i] = waveform[offset + i] * hann[i]; - } - - fft.doFFT(inBuffer.data(), outBuffer); - - // 
Calculate magnitudes in dB (only positive frequencies) - for (size_t i = 0; i < numBins; i++) { - const auto magnitude = std::abs(outBuffer[i]) * magnitudeScale; - const auto magnitude_db = - dbConversionFactor * log10f(magnitude + epsilon); - magnitudes.push_back(magnitude_db); - } - } - - return magnitudes; -} - } // namespace rnexecutorch::dsp diff --git a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp index a1194de69..ee53c7d5a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp @@ -30,7 +30,7 @@ BaseModel::BaseModel(const std::string &modelSource, } std::vector BaseModel::getInputShape(std::string method_name, - int32_t index) { + int32_t index) const { if (!module_) { throw std::runtime_error("Model not loaded: Cannot get input shape"); } @@ -56,7 +56,7 @@ std::vector BaseModel::getInputShape(std::string method_name, } std::vector> -BaseModel::getAllInputShapes(std::string methodName) { +BaseModel::getAllInputShapes(std::string methodName) const { if (!module_) { throw std::runtime_error("Model not loaded: Cannot get all input shapes"); } @@ -88,7 +88,7 @@ BaseModel::getAllInputShapes(std::string methodName) { /// to JS. It is not meant to be used within C++. If you want to call forward /// from C++ on a BaseModel, please use BaseModel::forward. std::vector -BaseModel::forwardJS(std::vector tensorViewVec) { +BaseModel::forwardJS(std::vector tensorViewVec) const { if (!module_) { throw std::runtime_error("Model not loaded: Cannot perform forward pass"); } @@ -136,7 +136,7 @@ BaseModel::forwardJS(std::vector tensorViewVec) { } Result -BaseModel::getMethodMeta(const std::string &methodName) { +BaseModel::getMethodMeta(const std::string &methodName) const { if (!module_) { throw std::runtime_error("Model not loaded: Cannot get method meta!"); } @@ -161,7 +161,7 @@ BaseModel::forward(const std::vector &input_evalues) const { Result> BaseModel::execute(const std::string &methodName, - const std::vector &input_value) { + const std::vector &input_value) const { if (!module_) { throw std::runtime_error("Model not loaded, cannot run execute."); } @@ -175,7 +175,7 @@ std::size_t BaseModel::getMemoryLowerBound() const noexcept { void BaseModel::unload() noexcept { module_.reset(nullptr); } std::vector -BaseModel::getTensorShape(const executorch::aten::Tensor &tensor) { +BaseModel::getTensorShape(const executorch::aten::Tensor &tensor) const { auto sizes = tensor.sizes(); return std::vector(sizes.begin(), sizes.end()); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h index b7b7b54ed..cf2940429 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h @@ -25,18 +25,20 @@ class BaseModel { Module::LoadMode loadMode = Module::LoadMode::MmapUseMlockIgnoreErrors); std::size_t getMemoryLowerBound() const noexcept; void unload() noexcept; - std::vector getInputShape(std::string method_name, int32_t index); + std::vector getInputShape(std::string method_name, + int32_t index) const; std::vector> - getAllInputShapes(std::string methodName = "forward"); + getAllInputShapes(std::string methodName = "forward") const; std::vector - forwardJS(std::vector tensorViewVec); + 
forwardJS(std::vector tensorViewVec) const; Result> forward(const EValue &input_value) const; Result> forward(const std::vector &input_value) const; - Result> execute(const std::string &methodName, - const std::vector &input_value); + Result> + execute(const std::string &methodName, + const std::vector &input_value) const; Result - getMethodMeta(const std::string &methodName); + getMethodMeta(const std::string &methodName) const; protected: // If possible, models should not use the JS runtime to keep JSI internals @@ -49,7 +51,8 @@ class BaseModel { std::size_t memorySizeLowerBound{0}; private: - std::vector getTensorShape(const executorch::aten::Tensor &tensor); + std::vector + getTensorShape(const executorch::aten::Tensor &tensor) const; }; } // namespace models diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp index d0f965cb3..bf8f9fb86 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp @@ -4,7 +4,6 @@ #include "ASR.h" #include "executorch/extension/tensor/tensor_ptr.h" #include "rnexecutorch/data_processing/Numerical.h" -#include "rnexecutorch/data_processing/dsp.h" #include "rnexecutorch/data_processing/gzip.h" namespace rnexecutorch::models::speech_to_text::asr { @@ -37,8 +36,7 @@ ASR::getInitialSequence(const DecodingOptions &options) const { return seq; } -GenerationResult ASR::generate(std::span waveform, - float temperature, +GenerationResult ASR::generate(std::span waveform, float temperature, const DecodingOptions &options) const { std::vector encoderOutput = this->encode(waveform); @@ -94,7 +92,7 @@ float ASR::getCompressionRatio(const std::string &text) const { } std::vector -ASR::generateWithFallback(std::span waveform, +ASR::generateWithFallback(std::span waveform, const DecodingOptions &options) const { std::vector temperatures = {0.0f, 0.2f, 0.4f, 0.6f, 0.8f, 1.0f}; std::vector bestTokens; @@ -209,7 +207,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens, return wordObjs; } -std::vector ASR::transcribe(std::span waveform, +std::vector ASR::transcribe(std::span waveform, const DecodingOptions &options) const { int32_t seek = 0; std::vector results; @@ -218,7 +216,7 @@ std::vector ASR::transcribe(std::span waveform, int32_t start = seek * ASR::kSamplingRate; const auto end = std::min( (seek + ASR::kChunkSize) * ASR::kSamplingRate, waveform.size()); - std::span chunk = waveform.subspan(start, end - start); + auto chunk = waveform.subspan(start, end - start); if (std::cmp_less(chunk.size(), ASR::kMinChunkSamples)) { break; @@ -246,19 +244,12 @@ std::vector ASR::transcribe(std::span waveform, return results; } -std::vector ASR::encode(std::span waveform) const { - constexpr int32_t fftWindowSize = 512; - constexpr int32_t stftHopLength = 160; - constexpr int32_t innerDim = 256; - - std::vector preprocessedData = - dsp::stftFromWaveform(waveform, fftWindowSize, stftHopLength); - const auto numFrames = - static_cast(preprocessedData.size()) / innerDim; - std::vector inputShape = {numFrames, innerDim}; +std::vector ASR::encode(std::span waveform) const { + auto inputShape = {static_cast(waveform.size())}; const auto modelInputTensor = executorch::extension::make_tensor_ptr( - std::move(inputShape), std::move(preprocessedData)); + std::move(inputShape), waveform.data(), + 
executorch::runtime::etensor::ScalarType::Float); const auto encoderResult = this->encoder->forward(modelInputTensor); if (!encoderResult.ok()) { @@ -268,7 +259,7 @@ std::vector ASR::encode(std::span waveform) const { } const auto decoderOutputTensor = encoderResult.get().at(0).toTensor(); - const int32_t outputNumel = decoderOutputTensor.numel(); + const auto outputNumel = decoderOutputTensor.numel(); const float *const dataPtr = decoderOutputTensor.const_data_ptr(); return {dataPtr, dataPtr + outputNumel}; @@ -277,8 +268,10 @@ std::vector ASR::encode(std::span waveform) const { std::vector ASR::decode(std::span tokens, std::span encoderOutput) const { std::vector tokenShape = {1, static_cast(tokens.size())}; + auto tokensLong = std::vector(tokens.begin(), tokens.end()); + auto tokenTensor = executorch::extension::make_tensor_ptr( - std::move(tokenShape), tokens.data(), ScalarType::Int); + tokenShape, tokensLong.data(), ScalarType::Long); const auto encoderOutputSize = static_cast(encoderOutput.size()); std::vector encShape = {1, ASR::kNumFrames, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h index 20180ebe4..a0ea7e181 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h @@ -14,9 +14,9 @@ class ASR { const models::BaseModel *decoder, const TokenizerModule *tokenizer); std::vector - transcribe(std::span waveform, + transcribe(std::span waveform, const types::DecodingOptions &options) const; - std::vector encode(std::span waveform) const; + std::vector encode(std::span waveform) const; std::vector decode(std::span tokens, std::span encoderOutput) const; @@ -44,11 +44,10 @@ class ASR { std::vector getInitialSequence(const types::DecodingOptions &options) const; - types::GenerationResult generate(std::span waveform, - float temperature, + types::GenerationResult generate(std::span waveform, float temperature, const types::DecodingOptions &options) const; std::vector - generateWithFallback(std::span waveform, + generateWithFallback(std::span waveform, const types::DecodingOptions &options) const; std::vector calculateWordLevelTimestamps(std::span tokens, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp index d07dbfb3c..dbc974706 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include namespace rnexecutorch::models::voice_activity_detection { @@ -158,4 +157,4 @@ VoiceActivityDetection::postprocess(const std::vector &scores, return speechSegments; } -} // namespace rnexecutorch::models::voice_activity_detection \ No newline at end of file +} // namespace rnexecutorch::models::voice_activity_detection diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 57381cf15..e9fe9e4d9 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts 
@@ -307,29 +307,32 @@ export const STYLE_TRANSFER_UDNIE = { }; // S2T -const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`; -const WHISPER_TINY_EN_DECODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_decoder_xnnpack.pte`; +const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`; +const WHISPER_TINY_EN_DECODER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_decoder_xnnpack.pte`; -const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_EN_ENCODER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_encoder_xnnpack.pte`; -const WHISPER_BASE_EN_DECODER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_decoder_xnnpack.pte`; +const WHISPER_TINY_EN_ENCODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_encoder_xnnpack.pte`; +const WHISPER_TINY_EN_DECODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_decoder_xnnpack.pte`; -const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_EN_ENCODER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_encoder_xnnpack.pte`; -const WHISPER_SMALL_EN_DECODER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_decoder_xnnpack.pte`; +const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_EN_ENCODER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_encoder_xnnpack.pte`; +const WHISPER_BASE_EN_DECODER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_decoder_xnnpack.pte`; -const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_ENCODER_MODEL = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_encoder_xnnpack.pte`; -const WHISPER_TINY_DECODER_MODEL = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_decoder_xnnpack.pte`; +const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_EN_ENCODER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_encoder_xnnpack.pte`; +const WHISPER_SMALL_EN_DECODER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_decoder_xnnpack.pte`; -const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_ENCODER_MODEL = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_encoder_xnnpack.pte`; -const WHISPER_BASE_DECODER_MODEL = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_decoder_xnnpack.pte`; +const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_ENCODER_MODEL = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_encoder_xnnpack.pte`; +const WHISPER_TINY_DECODER_MODEL = 
`${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_decoder_xnnpack.pte`; -const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_ENCODER_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_encoder_xnnpack.pte`; -const WHISPER_SMALL_DECODER_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_decoder_xnnpack.pte`; +const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_ENCODER_MODEL = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_encoder_xnnpack.pte`; +const WHISPER_BASE_DECODER_MODEL = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_decoder_xnnpack.pte`; + +const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_ENCODER_MODEL = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_encoder_xnnpack.pte`; +const WHISPER_SMALL_DECODER_MODEL = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_decoder_xnnpack.pte`; export const WHISPER_TINY_EN = { isMultilingual: false, @@ -338,6 +341,13 @@ export const WHISPER_TINY_EN = { tokenizerSource: WHISPER_TINY_EN_TOKENIZER, }; +export const WHISPER_TINY_EN_QUANTIZED = { + isMultilingual: false, + encoderSource: WHISPER_TINY_EN_ENCODER_QUANTIZED, + decoderSource: WHISPER_TINY_EN_DECODER_QUANTIZED, + tokenizerSource: WHISPER_TINY_EN_TOKENIZER, +}; + export const WHISPER_BASE_EN = { isMultilingual: false, encoderSource: WHISPER_BASE_EN_ENCODER, From 73a47a8df8bf20ab0f5ac32ab68a375f196c3d6b Mon Sep 17 00:00:00 2001 From: Jakub Chmura <92989966+chmjkb@users.noreply.github.com> Date: Wed, 3 Dec 2025 11:24:04 +0100 Subject: [PATCH 06/11] fix: Import Expo FS conditionally to work with Expo 54 (#699) ## Description Expo 54 introduces a new FileSystem API, deprecating the ones used in our codebase. The old APIs can still be accessed under `expo-file-system/legacy`. This is a temporary fix to work with old Expo versions. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [x] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) 
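For reference, a minimal sketch of the consumer-side pattern this change introduces (the helper name and import path are taken from the diff below; this is an illustration, not a new public API):

```typescript
// Before: static import, which resolves to APIs deprecated on Expo SDK 54
// import { readAsStringAsync } from 'expo-file-system';

// After: resolve the legacy module at runtime through the helper added in this patch
import { importLegacyExpoFSModules } from '../utils/ResourceFetcher';

const { readAsStringAsync } = importLegacyExpoFSModules();
```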
### Tested on - [x] iOS - [x] Android ### Testing instructions ### Screenshots ### Related issues ### Checklist - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes --- .../src/constants/directories.ts | 4 ++- .../src/controllers/LLMController.ts | 7 ++++-- .../src/utils/ResourceFetcher.ts | 25 +++++++++++++++++-- .../src/utils/ResourceFetcherUtils.ts | 20 +++++++-------- 4 files changed, 40 insertions(+), 16 deletions(-) diff --git a/packages/react-native-executorch/src/constants/directories.ts b/packages/react-native-executorch/src/constants/directories.ts index ac20d04d8..3cc6e68a9 100644 --- a/packages/react-native-executorch/src/constants/directories.ts +++ b/packages/react-native-executorch/src/constants/directories.ts @@ -1,3 +1,5 @@ -import { documentDirectory } from 'expo-file-system'; +import { importLegacyExpoFSModules } from '../utils/ResourceFetcher'; + +const { documentDirectory } = importLegacyExpoFSModules(); export const RNEDirectory = `${documentDirectory}react-native-executorch/`; diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index bcc131eba..bbc113a76 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -1,9 +1,11 @@ import { ResourceSource } from '../types/common'; -import { ResourceFetcher } from '../utils/ResourceFetcher'; +import { + importLegacyExpoFSModules, + ResourceFetcher, +} from '../utils/ResourceFetcher'; import { ETError, getError } from '../Error'; import { Template } from '@huggingface/jinja'; import { DEFAULT_CHAT_CONFIG } from '../constants/llmDefaults'; -import { readAsStringAsync } from 'expo-file-system'; import { ChatConfig, GenerationConfig, @@ -14,6 +16,7 @@ import { } from '../types/llm'; import { parseToolCall } from '../utils/llm'; import { Logger } from '../common/Logger'; +const { readAsStringAsync } = importLegacyExpoFSModules(); export class LLMController { private nativeModule: any; diff --git a/packages/react-native-executorch/src/utils/ResourceFetcher.ts b/packages/react-native-executorch/src/utils/ResourceFetcher.ts index fa2fd8c09..efc16ab63 100644 --- a/packages/react-native-executorch/src/utils/ResourceFetcher.ts +++ b/packages/react-native-executorch/src/utils/ResourceFetcher.ts @@ -27,7 +27,27 @@ * - Implements linked list behavior via the `.next` attribute * - Automatically processes subsequent downloads when `.next` contains a valid resource */ -import { +import type * as FileSystemTypes from 'expo-file-system'; + +export function importLegacyExpoFSModules() { + let FileSystem: typeof FileSystemTypes; + + try { + const expoPkg = require('expo/package.json'); + const sdkVersion = expoPkg.version.split('.')[0]; + + if (Number(sdkVersion) > 53) { + FileSystem = require('expo-file-system/legacy'); + } else { + FileSystem = require('expo-file-system'); + } + } catch (e) { + throw new Error('Expo must be installed to use react-native-executorch'); + } + return FileSystem; +} + +const { cacheDirectory, copyAsync, createDownloadResumable, @@ -37,7 +57,8 @@ import { EncodingType, deleteAsync, readDirectoryAsync, -} from 'expo-file-system'; +} = importLegacyExpoFSModules(); + import { Asset } from 'expo-asset'; import { Platform } from 'react-native'; import { 
RNEDirectory } from '../constants/directories'; diff --git a/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts b/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts index 67d6edc9b..d36a9ba5e 100644 --- a/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts +++ b/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts @@ -1,16 +1,14 @@ -/** - * @internal - */ - -import { - DownloadResumable, - getInfoAsync, - makeDirectoryAsync, -} from 'expo-file-system'; +import type * as FileSystemTypes from 'expo-file-system'; import { RNEDirectory } from '../constants/directories'; import { ResourceSource } from '../types/common'; import { Asset } from 'expo-asset'; import { Logger } from '../common/Logger'; +import { importLegacyExpoFSModules } from './ResourceFetcher'; + +/** + * @internal + */ +const { getInfoAsync, makeDirectoryAsync } = importLegacyExpoFSModules(); export const enum HTTP_CODE { OK = 200, @@ -42,7 +40,7 @@ export interface ResourceSourceExtended { } export interface DownloadResource { - downloadResumable: DownloadResumable; + downloadResumable: FileSystemTypes.DownloadResumable; status: DownloadStatus; extendedInfo: ResourceSourceExtended; } @@ -75,7 +73,7 @@ export namespace ResourceFetcherUtils { let totalLength = 0; let previousFilesTotalLength = 0; for (const source of sources) { - const type = await ResourceFetcherUtils.getType(source); + const type = ResourceFetcherUtils.getType(source); let length = 0; try { if (type === SourceType.REMOTE_FILE && typeof source === 'string') { From 0da57dc7ff704a02ba6b44d25531beef7aea3c63 Mon Sep 17 00:00:00 2001 From: Jakub Chmura <92989966+chmjkb@users.noreply.github.com> Date: Wed, 3 Dec 2025 12:17:35 +0100 Subject: [PATCH 07/11] fix: prevent OpenCV from overriding our threading configuration (#700) ## Description We observed activity on all CPU cores despite manually configuring the thread pool. OpenCV's internal threading was activating all available cores, overriding our optimized thread configuration and resulting in worse performance. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [x] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) 
### Tested on - [x] iOS - [x] Android ### Testing instructions ### Screenshots ### Related issues ### Checklist - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes --- .../rnexecutorch/RnExecutorchInstaller.cpp | 16 ---------------- .../rnexecutorch/threads/GlobalThreadPool.h | 4 ++++ 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp index 95f6e9e55..c25fbd13f 100644 --- a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp @@ -108,22 +108,6 @@ void RnExecutorchInstaller::injectJSIBindings( threads::utils::unsafeSetupThreadPool(); threads::GlobalThreadPool::initialize(); - -#if defined(__ANDROID__) && defined(__aarch64__) - auto num_of_perf_cores = - ::executorch::extension::cpuinfo::get_num_performant_cores(); - log(LOG_LEVEL::Info, "Detected ", num_of_perf_cores, " performant cores"); - // setting num_of_cores to floor(num_of_perf_cores / 2) + 1) because depending - // on cpu arch as when possible we want to leave at least 2 performant cores - // for other tasks (setting more actually results in drop of performance). For - // older devices (i.e. samsung s22) resolves to 3 cores, and for newer ones - // (like OnePlus 12) resolves to 4, which when benchamrked gives highest - // throughput. - auto num_of_cores = static_cast(num_of_perf_cores / 2) + 1; - ::executorch::extension::threadpool::get_threadpool() - ->_unsafe_reset_threadpool(num_of_cores); - log(LOG_LEVEL::Info, "Configuring xnnpack for ", num_of_cores, " threads"); -#endif } } // namespace rnexecutorch diff --git a/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h b/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h index 8b61080f8..50025eeeb 100644 --- a/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h +++ b/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,9 @@ class GlobalThreadPool { numThreads, "threads"); instance = std::make_unique(numThreads.value(), config); + // Disable OpenCV's internal threading to prevent it from overriding our + // thread pool configuration, which would cause degraded performance + cv::setNumThreads(0); }); } From 270c85e93fb9dcdaa935dabcf3cbee11671ef327 Mon Sep 17 00:00:00 2001 From: Jakub Chmura <92989966+chmjkb@users.noreply.github.com> Date: Thu, 4 Dec 2025 13:35:42 +0100 Subject: [PATCH 08/11] chore: update HuggingFace model URL tags to v0.6 (#701) ## Description ### Introduces a breaking change? - [ ] Yes - [ ] No ### Type of change - [ ] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) 
### Tested on - [ ] iOS - [ ] Android ### Testing instructions ### Screenshots ### Related issues ### Checklist - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes --- .../src/constants/modelUrls.ts | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index e9fe9e4d9..50e7ef5a8 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -2,8 +2,8 @@ import { Platform } from 'react-native'; const URL_PREFIX = 'https://huggingface.co/software-mansion/react-native-executorch'; -const VERSION_TAG = 'resolve/v0.5.0'; -const NEXT_VERSION_TAG = 'resolve/v0.6.0'; +const VERSION_TAG = 'resolve/v0.6.0'; +// const NEXT_VERSION_TAG = 'resolve/v0.7.0'; // LLMs @@ -307,32 +307,32 @@ export const STYLE_TRANSFER_UDNIE = { }; // S2T -const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`; -const WHISPER_TINY_EN_DECODER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_decoder_xnnpack.pte`; +const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`; +const WHISPER_TINY_EN_DECODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_decoder_xnnpack.pte`; -const WHISPER_TINY_EN_ENCODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_encoder_xnnpack.pte`; -const WHISPER_TINY_EN_DECODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_decoder_xnnpack.pte`; +const WHISPER_TINY_EN_ENCODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_encoder_xnnpack.pte`; +const WHISPER_TINY_EN_DECODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_decoder_xnnpack.pte`; -const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_EN_ENCODER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_encoder_xnnpack.pte`; -const WHISPER_BASE_EN_DECODER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_decoder_xnnpack.pte`; +const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_EN_ENCODER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_encoder_xnnpack.pte`; +const WHISPER_BASE_EN_DECODER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_decoder_xnnpack.pte`; -const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_EN_ENCODER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_encoder_xnnpack.pte`; -const WHISPER_SMALL_EN_DECODER = 
`${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_decoder_xnnpack.pte`; +const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_EN_ENCODER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_encoder_xnnpack.pte`; +const WHISPER_SMALL_EN_DECODER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_decoder_xnnpack.pte`; -const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_ENCODER_MODEL = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_encoder_xnnpack.pte`; -const WHISPER_TINY_DECODER_MODEL = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_decoder_xnnpack.pte`; +const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_ENCODER_MODEL = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_encoder_xnnpack.pte`; +const WHISPER_TINY_DECODER_MODEL = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_decoder_xnnpack.pte`; -const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_ENCODER_MODEL = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_encoder_xnnpack.pte`; -const WHISPER_BASE_DECODER_MODEL = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_decoder_xnnpack.pte`; +const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_ENCODER_MODEL = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_encoder_xnnpack.pte`; +const WHISPER_BASE_DECODER_MODEL = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_decoder_xnnpack.pte`; -const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_ENCODER_MODEL = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_encoder_xnnpack.pte`; -const WHISPER_SMALL_DECODER_MODEL = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_decoder_xnnpack.pte`; +const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_ENCODER_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_encoder_xnnpack.pte`; +const WHISPER_SMALL_DECODER_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_decoder_xnnpack.pte`; export const WHISPER_TINY_EN = { isMultilingual: false, @@ -452,7 +452,7 @@ export const BK_SDM_TINY_VPRED_256 = { }; // Voice Activity Detection -const FSMN_VAD_MODEL = `${URL_PREFIX}-fsmn-vad/${NEXT_VERSION_TAG}/xnnpack/fsmn-vad_xnnpack.pte`; +const FSMN_VAD_MODEL = `${URL_PREFIX}-fsmn-vad/${VERSION_TAG}/xnnpack/fsmn-vad_xnnpack.pte`; export const FSMN_VAD = { modelSource: FSMN_VAD_MODEL, From aa87474cc6135544a9f2862a23ee1e4435f56dc1 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Fri, 5 Dec 2025 08:11:52 +0100 Subject: [PATCH 09/11] update benchmarks for v0.6.0 --- .../useTextEmbeddings.md | 14 +- .../02-computer-vision/useClassification.md | 2 +- .../02-computer-vision/useImageEmbeddings.md | 6 +- .../02-hooks/02-computer-vision/useOCR.md | 16 +- .../02-computer-vision/useObjectDetection.md | 2 +- .../02-computer-vision/useStyleTransfer.md | 8 +- .../02-computer-vision/useVerticalOCR.md | 16 +- docs/docs/04-benchmarks/inference-time.md | 52 +- .../01-fundamentals/01-getting-started.md | 100 ++++ 
.../01-fundamentals/02-loading-models.md | 50 ++ .../03-frequently-asked-questions.md | 39 ++ .../01-fundamentals/_category_.json | 6 + .../_category_.json | 6 + .../01-natural-language-processing/useLLM.md | 537 ++++++++++++++++++ .../useSpeechToText.md | 343 +++++++++++ .../useTextEmbeddings.md | 158 ++++++ .../useTokenizer.md | 104 ++++ .../01-natural-language-processing/useVAD.md | 194 +++++++ .../02-computer-vision/_category_.json | 6 + .../02-computer-vision/useClassification.md | 113 ++++ .../02-computer-vision/useImageEmbeddings.md | 132 +++++ .../useImageSegmentation.md | 117 ++++ .../02-hooks/02-computer-vision/useOCR.md | 332 +++++++++++ .../02-computer-vision/useObjectDetection.md | 152 +++++ .../02-computer-vision/useStyleTransfer.md | 114 ++++ .../02-computer-vision/useTextToImage.md | 133 +++++ .../02-computer-vision/useVerticalOCR.md | 347 +++++++++++ .../03-executorch-bindings/_category_.json | 6 + .../useExecutorchModule.md | 155 +++++ .../version-0.6.x/02-hooks/_category_.json | 6 + .../LLMModule.md | 166 ++++++ .../SpeechToTextModule.md | 252 ++++++++ .../TextEmbeddingsModule.md | 59 ++ .../TokenizerModule.md | 60 ++ .../_category_.json | 6 + .../ClassificationModule.md | 64 +++ .../ImageEmbeddingsModule.md | 60 ++ .../ImageSegmentationModule.md | 77 +++ .../02-computer-vision/OCRModule.md | 135 +++++ .../ObjectDetectionModule.md | 77 +++ .../02-computer-vision/StyleTransferModule.md | 64 +++ .../02-computer-vision/VerticalOCRModule.md | 151 +++++ .../02-computer-vision/_category_.json | 6 + .../ExecutorchModule.md | 164 ++++++ .../03-executorch-bindings/_category_.json | 6 + .../03-typescript-api/_category_.json | 6 + .../04-benchmarks/_category_.json | 6 + .../04-benchmarks/inference-time.md | 111 ++++ .../04-benchmarks/memory-usage.md | 81 +++ .../version-0.6.x/04-benchmarks/model-size.md | 90 +++ .../05-utilities/_category_.json | 6 + .../05-utilities/resource-fetcher.md | 218 +++++++ 52 files changed, 5073 insertions(+), 58 deletions(-) create mode 100644 docs/versioned_docs/version-0.6.x/01-fundamentals/01-getting-started.md create mode 100644 docs/versioned_docs/version-0.6.x/01-fundamentals/02-loading-models.md create mode 100644 docs/versioned_docs/version-0.6.x/01-fundamentals/03-frequently-asked-questions.md create mode 100644 docs/versioned_docs/version-0.6.x/01-fundamentals/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/useLLM.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/useSpeechToText.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/useTokenizer.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/useVAD.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/useClassification.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/useImageEmbeddings.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/useImageSegmentation.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/useOCR.md create mode 100644 
docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/useObjectDetection.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/useStyleTransfer.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/useTextToImage.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/02-computer-vision/useVerticalOCR.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/03-executorch-bindings/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/03-executorch-bindings/useExecutorchModule.md create mode 100644 docs/versioned_docs/version-0.6.x/02-hooks/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/01-natural-language-processing/LLMModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/01-natural-language-processing/SpeechToTextModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/01-natural-language-processing/TextEmbeddingsModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/01-natural-language-processing/TokenizerModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/01-natural-language-processing/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/02-computer-vision/ClassificationModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/02-computer-vision/ImageEmbeddingsModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/02-computer-vision/ImageSegmentationModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/02-computer-vision/OCRModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/02-computer-vision/ObjectDetectionModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/02-computer-vision/StyleTransferModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/02-computer-vision/VerticalOCRModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/02-computer-vision/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/03-executorch-bindings/ExecutorchModule.md create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/03-executorch-bindings/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/03-typescript-api/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/04-benchmarks/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/04-benchmarks/inference-time.md create mode 100644 docs/versioned_docs/version-0.6.x/04-benchmarks/memory-usage.md create mode 100644 docs/versioned_docs/version-0.6.x/04-benchmarks/model-size.md create mode 100644 docs/versioned_docs/version-0.6.x/05-utilities/_category_.json create mode 100644 docs/versioned_docs/version-0.6.x/05-utilities/resource-fetcher.md diff --git a/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md b/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md index fd595d208..7d4706f15 100644 --- a/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md +++ b/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md @@ -145,13 +145,13 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. 
Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 16 | 16 | 19 | 54 | 28 | -| ALL_MPNET_BASE_V2 | 115 | 116 | 144 | 145 | 95 | -| MULTI_QA_MINILM_L6_COS_V1 | 16 | 16 | 20 | 47 | 28 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 112 | 119 | 144 | 146 | 96 | -| CLIP_VIT_BASE_PATCH32_TEXT | 47 | 45 | 57 | 65 | 48 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 7 | 21 | +| ALL_MPNET_BASE_V2 | 24 | 90 | +| MULTI_QA_MINILM_L6_COS_V1 | 7 | 19 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 24 | 88 | +| CLIP_VIT_BASE_PATCH32_TEXT | 14 | 39 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. diff --git a/docs/docs/02-hooks/02-computer-vision/useClassification.md b/docs/docs/02-hooks/02-computer-vision/useClassification.md index e17bfa775..eaf9afcb7 100644 --- a/docs/docs/02-hooks/02-computer-vision/useClassification.md +++ b/docs/docs/02-hooks/02-computer-vision/useClassification.md @@ -110,4 +110,4 @@ Times presented in the tables are measured as consecutive runs of the model. Ini | Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 105 | 110 | 149 | 299 | 227 | +| EFFICIENTNET_V2_S | 64 | 68 | 217 | 205 | 198 | diff --git a/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md b/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md index 4d417590c..b6decd1d2 100644 --- a/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md +++ b/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md @@ -123,9 +123,9 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. Performance also heavily depends on image size, because resize is expansive operation, especially on low-end devices. ::: -| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 70 | 70 | 90 | 66 | 58 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 18 | 55 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. 
All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. diff --git a/docs/docs/02-hooks/02-computer-vision/useOCR.md b/docs/docs/02-hooks/02-computer-vision/useOCR.md index 08e28f829..d07efd601 100644 --- a/docs/docs/02-hooks/02-computer-vision/useOCR.md +++ b/docs/docs/02-hooks/02-computer-vision/useOCR.md @@ -319,14 +319,14 @@ Times presented in the tables are measured as consecutive runs of the model. Ini | Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | | ---------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 1160 | 1144 | 1498 | 1567 | 1160 | -| **Detector (CRAFT_800_QUANTIZED)** | 669 | 649 | 825 | 541 | 474 | +| **Total Inference Time** | 652 | 600 | 2855 | 1092 | 1034 | +| **Detector (CRAFT_800_QUANTIZED)** | 220 | 221 | 1740 | 521 | 492 | | **Recognizer (CRNN_512)** | | | | | | -| ├─ Average Time | 48 | 47 | 60 | 91 | 72 | -| ├─ Total Time (3 runs) | 144 | 141 | 180 | 273 | 216 | +| ├─ Average Time | 45 | 38 | 110 | 40 | 38 | +| ├─ Total Time (3 runs) | 135 | 114 | 330 | 120 | 114 | | **Recognizer (CRNN_256)** | | | | | | -| ├─ Average Time | 22 | 22 | 29 | 51 | 30 | -| ├─ Total Time (7 runs) | 154 | 154 | 203 | 357 | 210 | +| ├─ Average Time | 21 | 18 | 54 | 20 | 19 | +| ├─ Total Time (7 runs) | 147 | 126 | 378 | 140 | 133 | | **Recognizer (CRNN_128)** | | | | | | -| ├─ Average Time | 11 | 11 | 14 | 28 | 17 | -| ├─ Total Time (7 runs) | 77 | 77 | 98 | 196 | 119 | +| ├─ Average Time | 11 | 9 | 27 | 10 | 10 | +| ├─ Total Time (7 runs) | 77 | 63 | 189 | 70 | 70 | diff --git a/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md b/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md index 7f49e8389..2bae6a658 100644 --- a/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md +++ b/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md @@ -149,4 +149,4 @@ Times presented in the tables are measured as consecutive runs of the model. Ini | Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 116 | 120 | 164 | 257 | 129 | +| SSDLITE_320_MOBILENET_V3_LARGE | 71 | 74 | 257 | 115 | 109 | diff --git a/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md b/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md index 2bedba325..f5d0a423c 100644 --- a/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md +++ b/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md @@ -108,7 +108,7 @@ Times presented in the tables are measured as consecutive runs of the model. 
Ini | Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 1356 | 1550 | 2003 | 2578 | 2328 | -| STYLE_TRANSFER_MOSAIC | 1376 | 1456 | 1971 | 2657 | 2394 | -| STYLE_TRANSFER_UDNIE | 1389 | 1499 | 1858 | 2380 | 2124 | -| STYLE_TRANSFER_RAIN_PRINCESS | 1339 | 1514 | 2004 | 2608 | 2371 | +| STYLE_TRANSFER_CANDY | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_MOSAIC | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_UDNIE | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1400 | 1485 | 4255 | 2510 | 2355 | diff --git a/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md b/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md index 94e5e3054..f317d527e 100644 --- a/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md +++ b/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md @@ -334,14 +334,14 @@ Times presented in the tables are measured as consecutive runs of the model. Ini | Metric | iPhone 17 Pro
<br/> [ms] | iPhone 16 Pro <br/> [ms] | iPhone SE 3 <br/> [ms] | Samsung Galaxy S24 <br/> [ms] | OnePlus 12 <br/>
[ms] | | -------------------------------------------------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 3819 / 3716 | 3978 / 3841 | 4751 / 4532 | 3095 / 3286 | 2787 / 2770 | -| **Detector (CRAFT_1280_QUANTIZED)** | 1749 | 1804 | 2105 | 1216 | 1171 | +| **Total Inference Time** | 1104 | 1113 | 8840 | 2845 | 2640 | +| **Detector (CRAFT_1280_QUANTIZED)** | 501 | 507 | 4317 | 1405 | 1275 | | **Detector (CRAFT_320_QUANTIZED)** | | | | | | -| ├─ Average Time | 458 | 474 | 561 | 360 | 332 | -| ├─ Total Time (4 runs) | 1832 | 1896 | 2244 | 1440 | 1328 | +| ├─ Average Time | 125 | 121 | 1060 | 338 | 299 | +| ├─ Total Time (4 runs) | 500 | 484 | 4240 | 1352 | 1196 | | **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | -| ├─ Average Time | 5 | 6 | 7 | 28 | 11 | -| ├─ Total Time (21 runs) | 105 | 126 | 147 | 588 | 231 | +| ├─ Average Time | 5 | 6 | 14 | 7 | 6 | +| ├─ Total Time (21 runs) | 105 | 126 | 294 | 147 | 126 | | **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | -| ├─ Average Time | 54 | 52 | 68 | 144 | 72 | -| ├─ Total Time (4 runs) | 216 | 208 | 272 | 576 | 288 | +| ├─ Average Time | 46 | 42 | 109 | 47 | 37 | +| ├─ Total Time (4 runs) | 184 | 168 | 436 | 188 | 148 | diff --git a/docs/docs/04-benchmarks/inference-time.md b/docs/docs/04-benchmarks/inference-time.md index 89f1f9de1..dbfc2b21d 100644 --- a/docs/docs/04-benchmarks/inference-time.md +++ b/docs/docs/04-benchmarks/inference-time.md @@ -10,22 +10,22 @@ Times presented in the tables are measured as consecutive runs of the model. Ini | Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 105 | 110 | 149 | 299 | 227 | +| EFFICIENTNET_V2_S | 64 | 68 | 217 | 205 | 198 | ## Object Detection | Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 116 | 120 | 164 | 257 | 129 | +| SSDLITE_320_MOBILENET_V3_LARGE | 71 | 74 | 257 | 115 | 109 | ## Style Transfer | Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 1356 | 1550 | 2003 | 2578 | 2328 | -| STYLE_TRANSFER_MOSAIC | 1376 | 1456 | 1971 | 2657 | 2394 | -| STYLE_TRANSFER_UDNIE | 1389 | 1499 | 1858 | 2380 | 2124 | -| STYLE_TRANSFER_RAIN_PRINCESS | 1339 | 1514 | 2004 | 2608 | 2371 | +| STYLE_TRANSFER_CANDY | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_MOSAIC | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_UDNIE | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1400 | 1485 | 4255 | 2510 | 2355 | ## OCR @@ -34,10 +34,10 @@ The values below represent the averages across all runs for the benchmark image. | Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Detector (CRAFT_800_QUANTIZED) | 669 | 649 | 825 | 541 | 474 | -| Recognizer (CRNN_512) | 48 | 47 | 60 | 91 | 72 | -| Recognizer (CRNN_256) | 22 | 22 | 29 | 51 | 30 | -| Recognizer (CRNN_128) | 11 | 11 | 14 | 28 | 17 | +| Detector (CRAFT_800_QUANTIZED) | 220 | 221 | 1740 | 521 | 492 | +| Recognizer (CRNN_512) | 45 | 38 | 110 | 40 | 38 | +| Recognizer (CRNN_256) | 21 | 18 | 54 | 20 | 19 | +| Recognizer (CRNN_128) | 11 | 9 | 27 | 10 | 10 | ## Vertical OCR @@ -46,10 +46,10 @@ The values below represent the averages across all runs for the benchmark image. 
| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Detector (CRAFT_1280_QUANTIZED) | 1749 | 1804 | 2105 | 1216 | 1171 | -| Detector (CRAFT_320_QUANTIZED) | 458 | 474 | 561 | 360 | 332 | -| Recognizer (CRNN_512) | 54 | 52 | 68 | 144 | 72 | -| Recognizer (CRNN_64) | 5 | 6 | 7 | 28 | 11 | +| Detector (CRAFT_1280_QUANTIZED) | 501 | 507 | 4317 | 1405 | 1275 | +| Detector (CRAFT_320_QUANTIZED) | 125 | 121 | 1060 | 338 | 299 | +| Recognizer (CRNN_512) | 46 | 42 | 109 | 47 | 37 | +| Recognizer (CRNN_64) | 5 | 6 | 14 | 7 | 6 | ## LLMs @@ -70,7 +70,7 @@ Average time for encoding audio of given length over 10 runs. For `Whisper` mode | Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 1391 | 1372 | 1894 | 1303 | 1214 | +| Whisper-tiny (30s) | 248 | 254 | 1145 | 435 | 526 | ### Decoding @@ -78,17 +78,17 @@ Average time for decoding one token in sequence of approximately 100 tokens, wit | Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 53 | 53 | 74 | 100 | 84 | +| Whisper-tiny (30s) | 23 | 25 | 121 | 92 | 115 | ## Text Embeddings -| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 16 | 16 | 19 | 54 | 28 | -| ALL_MPNET_BASE_V2 | 115 | 116 | 144 | 145 | 95 | -| MULTI_QA_MINILM_L6_COS_V1 | 16 | 16 | 20 | 47 | 28 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 112 | 119 | 144 | 146 | 96 | -| CLIP_VIT_BASE_PATCH32_TEXT | 47 | 45 | 57 | 65 | 48 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 7 | 21 | +| ALL_MPNET_BASE_V2 | 24 | 90 | +| MULTI_QA_MINILM_L6_COS_V1 | 7 | 19 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 24 | 88 | +| CLIP_VIT_BASE_PATCH32_TEXT | 14 | 39 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. @@ -96,9 +96,9 @@ Benchmark times for text embeddings are highly dependent on the sentence length. 
## Image Embeddings -| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 70 | 70 | 90 | 66 | 58 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 18 | 55 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. diff --git a/docs/versioned_docs/version-0.6.x/01-fundamentals/01-getting-started.md b/docs/versioned_docs/version-0.6.x/01-fundamentals/01-getting-started.md new file mode 100644 index 000000000..b5d60c35b --- /dev/null +++ b/docs/versioned_docs/version-0.6.x/01-fundamentals/01-getting-started.md @@ -0,0 +1,100 @@ +--- +title: Getting Started +slug: / +keywords: + [ + react native, + react native ai, + react native llm, + react native qwen, + react native llama, + react native executorch, + executorch, + on-device ai, + pytorch, + mobile ai, + ] +description: 'Get started with React Native ExecuTorch - a framework for running AI models on-device in your React Native applications.' +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## What is ExecuTorch? + +ExecuTorch is a novel AI framework developed by Meta, designed to streamline deploying PyTorch models on a variety of devices, including mobile phones and microcontrollers. This framework enables exporting models into standalone binaries, allowing them to run locally without requiring API calls. ExecuTorch achieves state-of-the-art performance through optimizations and delegates such as Core ML and XNNPACK. It provides a seamless export process with robust debugging options, making it easier to resolve issues if they arise. + +## React Native ExecuTorch + +React Native ExecuTorch is our way of bringing ExecuTorch into the React Native world. Our API is built to be simple, declarative, and efficient. Plus, we’ll provide a set of pre-exported models for common use cases, so you won’t have to worry about handling exports yourself. With just a few lines of JavaScript, you’ll be able to run AI models (even LLMs 👀) right on your device—keeping user data private and saving on cloud costs. + +## Compatibility + +React Native Executorch supports only the [New React Native architecture](https://reactnative.dev/architecture/landing-page). + +If your app still runs on the old architecture, please consider upgrading to the New Architecture. + +## Installation + +Installation is pretty straightforward, just use your favorite package manager. + + + + + ``` + npm install react-native-executorch + ``` + + + + + ``` + pnpm install react-native-executorch + ``` + + + + + ``` + yarn add react-native-executorch + ``` + + + + +If you're using bare React Native (instead of a managed Expo project), you also need to install Expo Modules because the underlying implementation relies on expo-file-system. 
Since expo-file-system is an Expo package, bare React Native projects need **Expo Modules** to properly integrate and use it. The link provided (https://docs.expo.dev/bare/installing-expo-modules/) offers guidance on setting up Expo Modules in a bare React Native environment. + +If you plan on using your models via require() instead of fetching them from a url, you also need to add following lines to your `metro.config.js`: + +```json +// metro.config.js +... + defaultConfig.resolver.assetExts.push('pte') + defaultConfig.resolver.assetExts.push('bin') +... +``` + +This allows us to use binaries, such as exported models or tokenizers for LLMs. + +:::caution +When using Expo, please note that you need to use a custom development build of your app, not the standard Expo Go app. This is because we rely on native modules, which Expo Go doesn’t support. +::: + +:::info +Because we are using ExecuTorch under the hood, you won't be able to build iOS app for release with simulator selected as the target device. Make sure to test release builds on real devices. +::: + +Running the app with the library: + +```bash +yarn run expo: -d +``` + +## Good reads + +If you want to dive deeper into ExecuTorch or our previous work with the framework, we highly encourage you to check out the following resources: + +- [ExecuTorch docs](https://pytorch.org/executorch/stable/index.html) +- [Native code for iOS](https://medium.com/swmansion/bringing-native-ai-to-your-mobile-apps-with-executorch-part-i-ios-f1562a4556e8?source=user_profile_page---------0-------------250189c98ccf---------------) +- [Native code for Android](https://medium.com/swmansion/bringing-native-ai-to-your-mobile-apps-with-executorch-part-ii-android-29431b6b9f7f?source=user_profile_page---------2-------------b8e3a5cb1c63---------------) +- [Exporting to Android with XNNPACK](https://medium.com/swmansion/exporting-ai-models-on-android-with-xnnpack-and-executorch-3e70cff51c59?source=user_profile_page---------1-------------b8e3a5cb1c63---------------) diff --git a/docs/versioned_docs/version-0.6.x/01-fundamentals/02-loading-models.md b/docs/versioned_docs/version-0.6.x/01-fundamentals/02-loading-models.md new file mode 100644 index 000000000..8763d9614 --- /dev/null +++ b/docs/versioned_docs/version-0.6.x/01-fundamentals/02-loading-models.md @@ -0,0 +1,50 @@ +--- +title: Loading Models +--- + +There are three different methods available for loading model files, depending on their size and location. + +**1. Load from React Native assets folder (For Files < 512MB)** + +```typescript +useExecutorchModule({ + modelSource: require('../assets/llama3_2.pte'), +}); +``` + +**2. Load from remote URL:** + +For files larger than 512MB or when you want to keep size of the app smaller, you can load the model from a remote URL (e.g. HuggingFace). + +```typescript +useExecutorchModule({ + modelSource: 'https://.../llama3_2.pte', +}); +``` + +**3. Load from local file system:** + +If you prefer to delegate the process of obtaining and loading model and tokenizer files to the user, you can use the following method: + +```typescript +useExecutorchModule({ + modelSource: 'file:///var/mobile/.../llama3_2.pte', +}); +``` + +:::info +The downloaded files are stored in documents directory of your application. 
+::: + +## Example + +The following code snippet demonstrates how to load model and tokenizer files using `useLLM` hook: + +```typescript +import { useLLM } from 'react-native-executorch'; + +const llama = useLLM({ + modelSource: 'https://.../llama3_2.pte', + tokenizerSource: require('../assets/tokenizer.bin'), +}); +``` diff --git a/docs/versioned_docs/version-0.6.x/01-fundamentals/03-frequently-asked-questions.md b/docs/versioned_docs/version-0.6.x/01-fundamentals/03-frequently-asked-questions.md new file mode 100644 index 000000000..03914b25d --- /dev/null +++ b/docs/versioned_docs/version-0.6.x/01-fundamentals/03-frequently-asked-questions.md @@ -0,0 +1,39 @@ +--- +title: Frequently Asked Questions +--- + +This section is meant to answer some common community inquiries, especially regarding the ExecuTorch runtime or adding your own models. If you can't see an answer to your question, feel free to open up a [discussion](https://github.com/software-mansion/react-native-executorch/discussions/new/choose). + +### What models are supported? + +Each hook documentation subpage (useClassification, useLLM, etc.) contains a supported models section, which lists the models that are runnable within the library with close to no setup. For running your custom models, refer to `ExecuTorchModule` or `useExecuTorchModule`. + +### How can I run my own AI model? + +To run your own model, you need to directly access the underlying [ExecuTorch Module API](https://pytorch.org/executorch/stable/extension-module.html). We provide an experimental [React hook](../02-hooks/03-executorch-bindings/useExecutorchModule.md) along with a [TypeScript alternative](../03-typescript-api/03-executorch-bindings/ExecutorchModule.md), which serve as a way to use the aforementioned API without the need of diving into native code. In order to get a model in a format runnable by the runtime, you'll need to get your hands dirty with some ExecuTorch knowledge. For more guides on exporting models, please refer to the [ExecuTorch tutorials](https://pytorch.org/executorch/stable/tutorials/export-to-executorch-tutorial.html). Once you obtain your model in a `.pte` format, you can run it with `useExecuTorchModule` and `ExecuTorchModule`. + +### Can you do function calling with useLLM? + +If your model supports tool calling (i.e. its chat template can process tools) you can use the method explained on the [useLLM page](../02-hooks/01-natural-language-processing/useLLM.md). + +If your model doesn't support it, you can still work around it using context. For details, refer to [this comment](https://github.com/software-mansion/react-native-executorch/issues/173#issuecomment-2775082278). + +### Can I use React Native ExecuTorch in bare React Native apps? + +To use the library, you need to install Expo Modules first. For a setup guide, refer to [this tutorial](https://docs.expo.dev/bare/installing-expo-modules/). This is because we use Expo File System under the hood to download and manage the model binaries. + +### Do you support the old architecture? + +The old architecture is not supported and we're currently not planning to add support. + +### Can I run GGUF models using the library? + +No, as of now ExecuTorch runtime doesn't provide a reliable way to use GGUF models, hence it is not possible. + +### Are the models leveraging GPU acceleration? + +While it is possible to run some models using Core ML on iOS, which is a backend that utilizes CPU, GPU and ANE, we currently don't have many models exported to Core ML. 
For Android, the current state of GPU acceleration is pretty limited. As of now, there are attempts of running the models using a Vulkan backend. However the operator support is very limited meaning that the resulting performance is often inferior to XNNPACK. Hence, most of the models use XNNPACK, which is a highly optimized and mature CPU backend that runs on both Android and iOS. + +### Does this library support XNNPACK and Core ML? + +Yes, all of the backends are linked, therefore the only thing that needs to be done on your end is to export the model with the backend that you're interested in using. diff --git a/docs/versioned_docs/version-0.6.x/01-fundamentals/_category_.json b/docs/versioned_docs/version-0.6.x/01-fundamentals/_category_.json new file mode 100644 index 000000000..e3fddcbeb --- /dev/null +++ b/docs/versioned_docs/version-0.6.x/01-fundamentals/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Fundamentals", + "link": { + "type": "generated-index" + } +} diff --git a/docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/_category_.json b/docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/_category_.json new file mode 100644 index 000000000..0314f315d --- /dev/null +++ b/docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Natural Language Processing", + "link": { + "type": "generated-index" + } +} diff --git a/docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/useLLM.md b/docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/useLLM.md new file mode 100644 index 000000000..3f072f93c --- /dev/null +++ b/docs/versioned_docs/version-0.6.x/02-hooks/01-natural-language-processing/useLLM.md @@ -0,0 +1,537 @@ +--- +title: useLLM +keywords: + [ + react native, + react native ai, + react native llm, + react native qwen, + react native llama, + react native executorch, + executorch, + pytorch, + on-device ai, + mobile ai, + llama 3, + qwen, + text generation, + tool calling, + function calling, + ] +description: "Learn how to use LLMs in your React Native applications with React Native ExecuTorch's useLLM hook." +--- + +React Native ExecuTorch supports a variety of LLMs (checkout our [HuggingFace repository](https://huggingface.co/software-mansion) for model already converted to ExecuTorch format) including Llama 3.2. Before getting started, you’ll need to obtain the .pte binary—a serialized model, the tokenizer and tokenizer config JSON files. There are various ways to accomplish this: + +- For your convenience, it's best if you use models exported by us, you can get them from our [HuggingFace repository](https://huggingface.co/software-mansion). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. +- Follow the official [tutorial](https://github.com/pytorch/executorch/blob/release/0.7/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md) made by ExecuTorch team to build the model and tokenizer yourself. + +:::danger +Lower-end devices might not be able to fit LLMs into memory. We recommend using quantized models to reduce the memory footprint. +::: + +## Initializing + +In order to load a model into the app, you need to run the following code: + +```typescript +import { useLLM, LLAMA3_2_1B } from 'react-native-executorch'; + +const llm = useLLM({ model: LLAMA3_2_1B }); +``` + +
+ +The code snippet above fetches the model from the specified URL, loads it into memory, and returns an object with various functions and properties for controlling the model. You can monitor the loading progress by checking the `llm.downloadProgress` and `llm.isReady` property, and if anything goes wrong, the `llm.error` property will contain the error message. + +### Arguments + +**`model`** - Object containing the model source, tokenizer source, and tokenizer config source. + +- **`modelSource`** - `ResourceSource` that specifies the location of the model binary. + +- **`tokenizerSource`** - `ResourceSource` pointing to the JSON file which contains the tokenizer. + +- **`tokenizerConfigSource`** - `ResourceSource` pointing to the JSON file which contains the tokenizer config. + +**`preventLoad?`** - Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. + +For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. + +### Returns + +| Field | Type | Description | +| ------------------------ | -------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `generate()` | `(messages: Message[], tools?: LLMTool[]) => Promise` | Runs model to complete chat passed in `messages` argument. It doesn't manage conversation context. | +| `interrupt()` | `() => void` | Function to interrupt the current inference. | +| `response` | `string` | State of the generated response. This field is updated with each token generated by the model. | +| `token` | `string` | The most recently generated token. | +| `isReady` | `boolean` | Indicates whether the model is ready. | +| `isGenerating` | `boolean` | Indicates whether the model is currently generating a response. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1, indicating the extent of the model file retrieval. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `configure` | `({chatConfig?: Partial, toolsConfig?: ToolsConfig, generationConfig?: GenerationConfig}) => void` | Configures chat and tool calling. See more details in [configuring the model](#configuring-the-model). | +| `sendMessage` | `(message: string) => Promise` | Function to add user message to conversation. After model responds, `messageHistory` will be updated with both user message and model response. | +| `deleteMessage` | `(index: number) => void` | Deletes all messages starting with message on `index` position. After deletion `messageHistory` will be updated. | +| `messageHistory` | `Message[]` | History containing all messages in conversation. This field is updated after model responds to `sendMessage`. | +| `getGeneratedTokenCount` | `() => number` | Returns the number of tokens generated in the last response. | + +
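+Below is a minimal sketch of how these fields can be wired together in a screen component. It gates the UI on `isReady`, surfaces `downloadProgress` and `error` while the model is being fetched, and only calls `generate` once loading has finished. The component name and UI markup are purely illustrative, and the `Message` type is assumed to be exported by the package, as in the generation examples further below.
+
+```tsx
+import React from 'react';
+import { Button, Text, View } from 'react-native';
+import { useLLM, LLAMA3_2_1B } from 'react-native-executorch';
+import type { Message } from 'react-native-executorch';
+
+export default function LlmStatusExample() {
+  const llm = useLLM({ model: LLAMA3_2_1B });
+
+  // Surface load errors first.
+  if (llm.error) {
+    return <Text>Model failed to load: {llm.error}</Text>;
+  }
+
+  // Report download progress (a value between 0 and 1) until the model is ready.
+  if (!llm.isReady) {
+    return <Text>Loading model: {Math.round(llm.downloadProgress * 100)}%</Text>;
+  }
+
+  const askQuestion = () => {
+    const chat: Message[] = [
+      { role: 'system', content: 'You are a helpful assistant' },
+      { role: 'user', content: 'What is the meaning of life?' },
+    ];
+    // `response` is updated token by token while generation is running.
+    llm.generate(chat);
+  };
+
+  return (
+    <View>
+      <Button title="Ask" onPress={askQuestion} disabled={llm.isGenerating} />
+      <Text>{llm.response}</Text>
+    </View>
+  );
+}
+```
+
+Because `response` streams in token by token, rendering it directly in a `Text` element keeps the UI updated as generation progresses.
+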
+Type definitions + +```typescript +const useLLM: ({ + model, + preventLoad, +}: { + model: { + modelSource: ResourceSource; + tokenizerSource: ResourceSource; + tokenizerConfigSource: ResourceSource; + }; + preventLoad?: boolean; +}) => LLMType; + +interface LLMType { + messageHistory: Message[]; + response: string; + token: string; + isReady: boolean; + isGenerating: boolean; + downloadProgress: number; + error: string | null; + configure: ({ + chatConfig, + toolsConfig, + generationConfig, + }: { + chatConfig?: Partial; + toolsConfig?: ToolsConfig; + generationConfig?: GenerationConfig; + }) => void; + getGeneratedTokenCount: () => number; + generate: (messages: Message[], tools?: LLMTool[]) => Promise; + sendMessage: (message: string) => Promise; + deleteMessage: (index: number) => void; + interrupt: () => void; +} + +type ResourceSource = string | number | object; + +type MessageRole = 'user' | 'assistant' | 'system'; + +interface Message { + role: MessageRole; + content: string; +} +interface ChatConfig { + initialMessageHistory: Message[]; + contextWindowLength: number; + systemPrompt: string; +} + +interface GenerationConfig { + temperature?: number; + topp?: number; + outputTokenBatchSize?: number; + batchTimeInterval?: number; +} + +// tool calling +interface ToolsConfig { + tools: LLMTool[]; + executeToolCallback: (call: ToolCall) => Promise; + displayToolCalls?: boolean; +} + +interface ToolCall { + toolName: string; + arguments: Object; +} + +type LLMTool = Object; +``` + +
+ +## Functional vs managed + +You can use functions returned from this hooks in two manners: + +1. Functional/pure - we will not keep any state for you. You'll need to keep conversation history and handle function calling yourself. Use `generate` (and rarely `forward`) and `response`. Note that you don't need to run `configure` to use those. Furthermore, `chatConfig` and `toolsConfig` will not have any effect on those functions. + +2. Managed/stateful - we will manage conversation state. Tool calls will be parsed and called automatically after passing appropriate callbacks. See more at [managed LLM chat](#managed-llm-chat). + +## Functional way + +### Simple generation + +To perform chat completion you can use the `generate` function. There is no return value. Instead, the `response` value is updated with each token. + +```tsx +const llm = useLLM({ model: LLAMA3_2_1B }); + +const handleGenerate = () => { + const chat: Message[] = [ + { role: 'system', content: 'You are a helpful assistant' }, + { role: 'user', content: 'Hi!' }, + { role: 'assistant', content: 'Hi!, how can I help you?' }, + { role: 'user', content: 'What is the meaning of life?' }, + ]; + + // Chat completion + llm.generate(chat); +}; + +return ( + +