diff --git a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md index 8876bf37e..d94c96a66 100644 --- a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md @@ -75,20 +75,20 @@ For more information on loading resources, take a look at [loading models](../.. ### Returns -| Field | Type | Description | -| --------------------------- | ---------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | -| `stream` | `(options?: DecodingOptions \| undefined) => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | -| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | -| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | -| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | -| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | -| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | -| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | -| `error` | `string \| null` | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Tracks the progress of the model download process. 
| +| Field | Type | Description | +| --------------------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | +| `stream` | `(options?: DecodingOptions \| undefined) => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | +| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | +| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | +| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | +| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | +| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | +| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | +| `error` | `string \| null` | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Tracks the progress of the model download process. |
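For orientation, the streaming fields above are typically used together. The sketch below is illustrative only: the hook configuration (`{ model: WHISPER_TINY }`) and the `getNextAudioChunk` microphone source are assumptions, so follow the loading section above for the exact setup.

```tsx
import { useSpeechToText, WHISPER_TINY } from 'react-native-executorch';

// A minimal streaming sketch. The audio source is a placeholder: anything that
// yields 16 kHz Float32Array chunks (e.g. a microphone recorder) will do.
function useLiveTranscription(
  getNextAudioChunk: () => Promise<Float32Array | null>
) {
  // The hook configuration below is illustrative only; follow the loading
  // section above for the exact options accepted by your version.
  const {
    isReady,
    stream,
    streamInsert,
    streamStop,
    committedTranscription,
    nonCommittedTranscription,
  } = useSpeechToText({ model: WHISPER_TINY });

  const start = async () => {
    if (!isReady) return;
    // Begin streaming; DecodingOptions such as { language: 'es' } apply to multilingual models.
    const finished = stream({ language: 'es' });
    let chunk = await getNextAudioChunk();
    while (chunk !== null) {
      streamInsert(chunk); // feed audio sampled at 16 kHz as it becomes available
      chunk = await getNextAudioChunk();
    }
    streamStop();
    await finished;
  };

  // committedTranscription holds stable text; nonCommittedTranscription holds the live partial tail.
  return { start, committedTranscription, nonCommittedTranscription };
}
```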
Type definitions @@ -340,4 +340,4 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | -| WHISPER_TINY | 900 | 600 | +| WHISPER_TINY | 410 | 375 | diff --git a/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md b/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md index c40d19e94..7d4706f15 100644 --- a/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md +++ b/docs/docs/02-hooks/01-natural-language-processing/useTextEmbeddings.md @@ -133,11 +133,11 @@ For the supported models, the returned embedding vector is normalized, meaning t | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | -| ALL_MINILM_L6_V2 | 85 | 100 | -| ALL_MPNET_BASE_V2 | 390 | 465 | -| MULTI_QA_MINILM_L6_COS_V1 | 115 | 130 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 415 | 490 | -| CLIP_VIT_BASE_PATCH32_TEXT | 195 | 250 | +| ALL_MINILM_L6_V2 | 95 | 110 | +| ALL_MPNET_BASE_V2 | 405 | 455 | +| MULTI_QA_MINILM_L6_COS_V1 | 120 | 140 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 435 | 455 | +| CLIP_VIT_BASE_PATCH32_TEXT | 200 | 280 | ### Inference time @@ -145,13 +145,13 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 15 | 22 | 23 | 36 | 31 | -| ALL_MPNET_BASE_V2 | 71 | 96 | 101 | 112 | 105 | -| MULTI_QA_MINILM_L6_COS_V1 | 15 | 22 | 23 | 36 | 31 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 71 | 95 | 100 | 112 | 105 | -| CLIP_VIT_BASE_PATCH32_TEXT | 31 | 47 | 48 | 55 | 49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 7 | 21 | +| ALL_MPNET_BASE_V2 | 24 | 90 | +| MULTI_QA_MINILM_L6_COS_V1 | 7 | 19 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 24 | 88 | +| CLIP_VIT_BASE_PATCH32_TEXT | 14 | 39 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. diff --git a/docs/docs/02-hooks/02-computer-vision/useClassification.md b/docs/docs/02-hooks/02-computer-vision/useClassification.md index b4d3f34a6..eaf9afcb7 100644 --- a/docs/docs/02-hooks/02-computer-vision/useClassification.md +++ b/docs/docs/02-hooks/02-computer-vision/useClassification.md @@ -100,7 +100,7 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 130 | 85 | +| EFFICIENTNET_V2_S | 230 | 87 | ### Inference time @@ -108,6 +108,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 64 | 68 | 217 | 205 | 198 | diff --git a/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md b/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md index 6dbdc7dcc..b6decd1d2 100644 --- a/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md +++ b/docs/docs/02-hooks/02-computer-vision/useImageEmbeddings.md @@ -123,9 +123,9 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. Performance also heavily depends on image size, because resize is expansive operation, especially on low-end devices. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 48 | 64 | 69 | 65 | 63 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 18 | 55 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. diff --git a/docs/docs/02-hooks/02-computer-vision/useOCR.md b/docs/docs/02-hooks/02-computer-vision/useOCR.md index 037daebf7..d07efd601 100644 --- a/docs/docs/02-hooks/02-computer-vision/useOCR.md +++ b/docs/docs/02-hooks/02-computer-vision/useOCR.md @@ -288,20 +288,20 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc ### Model size -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. 
### Memory usage -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1600 | 1700 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ### Inference time @@ -317,18 +317,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini **Time measurements:** -| Metric | iPhone 14 Pro Max
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | -| ------------------------- | ----------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 4330 | 2537 | ❌ | 6648 | 5993 | -| **Detector (CRAFT_800)** | 1945 | 1809 | ❌ | 2080 | 1961 | -| **Recognizer (CRNN_512)** | | | | | | -| ├─ Average Time | 273 | 76 | ❌ | 289 | 252 | -| ├─ Total Time (3 runs) | 820 | 229 | ❌ | 867 | 756 | -| **Recognizer (CRNN_256)** | | | | | | -| ├─ Average Time | 137 | 39 | ❌ | 260 | 229 | -| ├─ Total Time (7 runs) | 958 | 271 | ❌ | 1818 | 1601 | -| **Recognizer (CRNN_128)** | | | | | | -| ├─ Average Time | 68 | 18 | ❌ | 239 | 214 | -| ├─ Total Time (7 runs) | 478 | 124 | ❌ | 1673 | 1498 | - -❌ - Insufficient RAM. +| Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | +| ---------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | +| **Total Inference Time** | 652 | 600 | 2855 | 1092 | 1034 | +| **Detector (CRAFT_800_QUANTIZED)** | 220 | 221 | 1740 | 521 | 492 | +| **Recognizer (CRNN_512)** | | | | | | +| ├─ Average Time | 45 | 38 | 110 | 40 | 38 | +| ├─ Total Time (3 runs) | 135 | 114 | 330 | 120 | 114 | +| **Recognizer (CRNN_256)** | | | | | | +| ├─ Average Time | 21 | 18 | 54 | 20 | 19 | +| ├─ Total Time (7 runs) | 147 | 126 | 378 | 140 | 133 | +| **Recognizer (CRNN_128)** | | | | | | +| ├─ Average Time | 11 | 9 | 27 | 10 | 10 | +| ├─ Total Time (7 runs) | 77 | 63 | 189 | 70 | 70 | diff --git a/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md b/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md index ac756d6a6..2bae6a658 100644 --- a/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md +++ b/docs/docs/02-hooks/02-computer-vision/useObjectDetection.md @@ -139,7 +139,7 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 90 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | ### Inference time @@ -147,6 +147,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 71 | 74 | 257 | 115 | 109 | diff --git a/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md b/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md index 899a619ca..f5d0a423c 100644 --- a/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md +++ b/docs/docs/02-hooks/02-computer-vision/useStyleTransfer.md @@ -95,10 +95,10 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 950 | 350 | -| STYLE_TRANSFER_MOSAIC | 950 | 350 | -| STYLE_TRANSFER_UDNIE | 950 | 350 | -| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 | +| STYLE_TRANSFER_CANDY | 1200 | 380 | +| STYLE_TRANSFER_MOSAIC | 1200 | 380 | +| STYLE_TRANSFER_UDNIE | 1200 | 380 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | ### Inference time @@ -106,9 +106,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
:::
-| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
+| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
| ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: |
-| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 |
-| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 |
-| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 |
-| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 |
+| STYLE_TRANSFER_CANDY | 1400 | 1485 | 4255 | 2510 | 2355 |
+| STYLE_TRANSFER_MOSAIC | 1400 | 1485 | 4255 | 2510 | 2355 |
+| STYLE_TRANSFER_UDNIE | 1400 | 1485 | 4255 | 2510 | 2355 |
+| STYLE_TRANSFER_RAIN_PRINCESS | 1400 | 1485 | 4255 | 2510 | 2355 |
diff --git a/docs/docs/02-hooks/02-computer-vision/useTextToImage.md b/docs/docs/02-hooks/02-computer-vision/useTextToImage.md
index 83e47a3e2..3eaf7d826 100644
--- a/docs/docs/02-hooks/02-computer-vision/useTextToImage.md
+++ b/docs/docs/02-hooks/02-computer-vision/useTextToImage.md
@@ -124,9 +124,9 @@ The number following the underscore (\_) indicates that the model supports gener
### Inference time
-| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
-| --------------------- | :--------------------------: | :------------------------------: | :-------------------: | :-------------------------------: | :-----------------------: |
-| BK_SDM_TINY_VPRED_256 | 19100 | 25000 | ❌ | ❌ | 23100 |
+| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
+| --------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: |
+| BK_SDM_TINY_VPRED_256 | 21184 | 21021 | ❌ | 18834 | 16617 |
:::info
Text-to-image benchmark times are measured generating 256×256 images in 10 inference steps.
diff --git a/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md b/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md
index 29a4de452..f317d527e 100644
--- a/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md
+++ b/docs/docs/02-hooks/02-computer-vision/useVerticalOCR.md
@@ -302,12 +302,12 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc
### Model size
-| Model | XNNPACK [MB] |
-| --------------------- | :----------: |
-| Detector (CRAFT_1280) | 83.1 |
-| Detector (CRAFT_320) | 83.1 |
-| Recognizer (CRNN_512) | 15 - 18\* |
-| Recognizer (CRNN_64) | 15 - 16\* |
+| Model | XNNPACK [MB] |
+| ------------------------------- | :----------: |
+| Detector (CRAFT_1280_QUANTIZED) | 19.8 |
+| Detector (CRAFT_320_QUANTIZED) | 19.8 |
+| Recognizer (CRNN_512) | 15 - 18\* |
+| Recognizer (CRNN_64) | 15 - 16\* |

\* - The model weights vary depending on the language. 
@@ -315,8 +315,8 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2172 | 2214 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1774 | 1705 | +| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1070 | 1000 | ### Inference time @@ -332,18 +332,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini **Time measurements:** -| Metric | iPhone 14 Pro Max
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | -| -------------------------------------------------------------------------- | ----------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 9350 / 9620 | 8572 / 8621 | ❌ | 13737 / 10570 | 13436 / 9848 | -| **Detector (CRAFT_1250)** | 4895 | 4756 | ❌ | 5574 | 5016 | -| **Detector (CRAFT_320)** | | | | | | -| ├─ Average Time | 1247 | 1206 | ❌ | 1350 | 1356 | -| ├─ Total Time (3 runs) | 3741 | 3617 | ❌ | 4050 | 4069 | -| **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | -| ├─ Average Time | 31 | 9 | ❌ | 195 | 207 | -| ├─ Total Time (21 runs) | 649 | 191 | ❌ | 4092 | 4339 | -| **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | -| ├─ Average Time | 306 | 80 | ❌ | 308 | 250 | -| ├─ Total Time (3 runs) | 919 | 240 | ❌ | 925 | 751 | - -❌ - Insufficient RAM. +| Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | +| -------------------------------------------------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | +| **Total Inference Time** | 1104 | 1113 | 8840 | 2845 | 2640 | +| **Detector (CRAFT_1280_QUANTIZED)** | 501 | 507 | 4317 | 1405 | 1275 | +| **Detector (CRAFT_320_QUANTIZED)** | | | | | | +| ├─ Average Time | 125 | 121 | 1060 | 338 | 299 | +| ├─ Total Time (4 runs) | 500 | 484 | 4240 | 1352 | 1196 | +| **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | +| ├─ Average Time | 5 | 6 | 14 | 7 | 6 | +| ├─ Total Time (21 runs) | 105 | 126 | 294 | 147 | 126 | +| **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | +| ├─ Average Time | 46 | 42 | 109 | 47 | 37 | +| ├─ Total Time (4 runs) | 184 | 168 | 436 | 188 | 148 | diff --git a/docs/docs/04-benchmarks/inference-time.md b/docs/docs/04-benchmarks/inference-time.md index fa12ade94..dbfc2b21d 100644 --- a/docs/docs/04-benchmarks/inference-time.md +++ b/docs/docs/04-benchmarks/inference-time.md @@ -8,46 +8,48 @@ Times presented in the tables are measured as consecutive runs of the model. Ini ## Classification -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 64 | 68 | 217 | 205 | 198 | ## Object Detection -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 71 | 74 | 257 | 115 | 109 | ## Style Transfer -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_MOSAIC | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_UDNIE | 1400 | 1485 | 4255 | 2510 | 2355 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1400 | 1485 | 4255 | 2510 | 2355 | ## OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_800) | 2099 | 2227 | ❌ | 2245 | 7108 | -| Recognizer (CRNN_512) | 70 | 252 | ❌ | 54 | 151 | -| Recognizer (CRNN_256) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_128) | 17 | 83 | ❌ | 14 | 39 | +Notice that the recognizer models were executed between 3 and 7 times during a single recognition. 
+The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_800_QUANTIZED) | 220 | 221 | 1740 | 521 | 492 | +| Recognizer (CRNN_512) | 45 | 38 | 110 | 40 | 38 | +| Recognizer (CRNN_256) | 21 | 18 | 54 | 20 | 19 | +| Recognizer (CRNN_128) | 11 | 9 | 27 | 10 | 10 | ## Vertical OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_1280) | 5457 | 5833 | ❌ | 6296 | 14053 | -| Detector (CRAFT_320) | 1351 | 1460 | ❌ | 1485 | 3101 | -| Recognizer (CRNN_512) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_64) | 10 | 33 | ❌ | 7 | 18 | +Notice that the recognizer models, as well as detector CRAFT_320 model, were executed between 4 and 21 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_1280_QUANTIZED) | 501 | 507 | 4317 | 1405 | 1275 | +| Detector (CRAFT_320_QUANTIZED) | 125 | 121 | 1060 | 338 | 299 | +| Recognizer (CRNN_512) | 46 | 42 | 109 | 47 | 37 | +| Recognizer (CRNN_64) | 5 | 6 | 14 | 7 | 6 | ## LLMs @@ -62,41 +64,31 @@ Times presented in the tables are measured as consecutive runs of the model. Ini ❌ - Insufficient RAM. -## Streaming mode - -Notice than for `Whisper` model which has to take as an input 30 seconds audio chunks (for shorter audio it is automatically padded with silence to 30 seconds) `fast` mode has the lowest latency (time from starting transcription to first token returned, caused by streaming algorithm), but the slowest speed. If you believe that this might be a problem for you, prefer `balanced` mode instead. 
-
-| Model (mode) | iPhone 16 Pro (XNNPACK) [latency \| tokens/s] | iPhone 14 Pro (XNNPACK) [latency \| tokens/s] | iPhone SE 3 (XNNPACK) [latency \| tokens/s] | Samsung Galaxy S24 (XNNPACK) [latency \| tokens/s] | OnePlus 12 (XNNPACK) [latency \| tokens/s] |
-| ----------------------- | :-------------------------------------------: | :-------------------------------------------: | :-----------------------------------------: | :------------------------------------------------: | :----------------------------------------: |
-| Whisper-tiny (fast) | 2.8s \| 5.5t/s | 3.7s \| 4.4t/s | 4.4s \| 3.4t/s | 5.5s \| 3.1t/s | 5.3s \| 3.8t/s |
-| Whisper-tiny (balanced) | 5.6s \| 7.9t/s | 7.0s \| 6.3t/s | 8.3s \| 5.0t/s | 8.4s \| 6.7t/s | 7.7s \| 7.2t/s |
-| Whisper-tiny (quality) | 10.3s \| 8.3t/s | 12.6s \| 6.8t/s | 7.8s \| 8.9t/s | 13.5s \| 7.1t/s | 12.9s \| 7.5t/s |
-
### Encoding
Average time for encoding audio of given length over 10 runs. For `Whisper` model we only list 30 sec audio chunks since `Whisper` does not accept other lengths (for shorter audio the audio needs to be padded to 30sec with silence).
-| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
+| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
| ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: |
-| Whisper-tiny (30s) | 1034 | 1344 | 1269 | 2916 | 2143 |
+| Whisper-tiny (30s) | 248 | 254 | 1145 | 435 | 526 |
### Decoding
-Average time for decoding one token in sequence of 100 tokens, with encoding context is obtained from audio of noted length.
+Average time for decoding one token in a sequence of approximately 100 tokens, where the encoding context is obtained from audio of the noted length. 
-| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 128.03 | 113.65 | 141.63 | 89.08 | 84.49 | +| Whisper-tiny (30s) | 23 | 25 | 121 | 92 | 115 | ## Text Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 15 | 22 | 23 | 36 | 31 | -| ALL_MPNET_BASE_V2 | 71 | 96 | 101 | 112 | 105 | -| MULTI_QA_MINILM_L6_COS_V1 | 15 | 22 | 23 | 36 | 31 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 71 | 95 | 100 | 112 | 105 | -| CLIP_VIT_BASE_PATCH32_TEXT | 31 | 47 | 48 | 55 | 49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 7 | 21 | +| ALL_MPNET_BASE_V2 | 24 | 90 | +| MULTI_QA_MINILM_L6_COS_V1 | 7 | 19 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 24 | 88 | +| CLIP_VIT_BASE_PATCH32_TEXT | 14 | 39 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. @@ -104,9 +96,9 @@ Benchmark times for text embeddings are highly dependent on the sentence length. ## Image Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 48 | 64 | 69 | 65 | 63 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 18 | 55 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. @@ -114,18 +106,6 @@ Image embedding benchmark times are measured using 224×224 pixel images, as req ## Text to Image -Average time for generating one image of size 256×256 in 10 inference steps. 
- -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :-------------------: | :-------------------------------: | :-----------------------: | -| BK_SDM_TINY_VPRED_256 | 19100 | 25000 | ❌ | ❌ | 23100 | - -## Voice Activity Detection (VAD) - -Average time for processing 60s audio. - - - -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| -------- | :--------------------------: | :------------------------------: | :------------------------: | :-----------------------: | -| FSMN_VAD | 151 | 171 | 180 | 109 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| BK_SDM_TINY_VPRED_256 | 21184 | 21021 | ❌ | 18834 | 16617 | diff --git a/docs/docs/04-benchmarks/memory-usage.md b/docs/docs/04-benchmarks/memory-usage.md index e250b0c90..a0c5a7b6d 100644 --- a/docs/docs/04-benchmarks/memory-usage.md +++ b/docs/docs/04-benchmarks/memory-usage.md @@ -2,82 +2,80 @@ title: Memory Usage --- +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 (Android). +::: + ## Classification | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 130 | 85 | +| EFFICIENTNET_V2_S | 230 | 87 | ## Object Detection | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 90 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | ## Style Transfer | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 950 | 350 | -| STYLE_TRANSFER_MOSAIC | 950 | 350 | -| STYLE_TRANSFER_UDNIE | 950 | 350 | -| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 | +| STYLE_TRANSFER_CANDY | 1200 | 380 | +| STYLE_TRANSFER_MOSAIC | 1200 | 380 | +| STYLE_TRANSFER_UDNIE | 1200 | 380 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | ## OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 2100 | 1782 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ## Vertical OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2770 | 3720 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1770 | 2740 | +| Model | 
Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ---------------------------------------------------------------------------------------- | :--------------------: | :----------------: | +| Detector (CRAFT_1280_QUANTIZED) + Detector (CRAFT_320_QUANTIZED) + Recognizer (CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280_QUANTIZED) + Detector(CRAFT_320_QUANTIZED) + Recognizer (CRNN_64) | 1070 | 1000 | ## LLMs | Model | Android (XNNPACK) [GB] | iOS (XNNPACK) [GB] | | --------------------- | :--------------------: | :----------------: | -| LLAMA3_2_1B | 3.2 | 3.1 | -| LLAMA3_2_1B_SPINQUANT | 1.9 | 2 | -| LLAMA3_2_1B_QLORA | 2.2 | 2.5 | +| LLAMA3_2_1B | 3.3 | 3.1 | +| LLAMA3_2_1B_SPINQUANT | 1.9 | 2.4 | +| LLAMA3_2_1B_QLORA | 2.7 | 2.8 | | LLAMA3_2_3B | 7.1 | 7.3 | | LLAMA3_2_3B_SPINQUANT | 3.7 | 3.8 | -| LLAMA3_2_3B_QLORA | 4 | 4.1 | +| LLAMA3_2_3B_QLORA | 3.9 | 4.0 | ## Speech to text | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | -| WHISPER_TINY | 900 | 600 | +| WHISPER_TINY | 410 | 375 | ## Text Embeddings | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | -| ALL_MINILM_L6_V2 | 85 | 100 | -| ALL_MPNET_BASE_V2 | 390 | 465 | -| MULTI_QA_MINILM_L6_COS_V1 | 115 | 130 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 415 | 490 | -| CLIP_VIT_BASE_PATCH32_TEXT | 195 | 250 | +| ALL_MINILM_L6_V2 | 95 | 110 | +| ALL_MPNET_BASE_V2 | 405 | 455 | +| MULTI_QA_MINILM_L6_COS_V1 | 120 | 140 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 435 | 455 | +| CLIP_VIT_BASE_PATCH32_TEXT | 200 | 280 | ## Image Embeddings | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | --------------------------- | :--------------------: | :----------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 350 | 340 | +| CLIP_VIT_BASE_PATCH32_IMAGE | 345 | 340 | ## Text to Image | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | --------------------- | ---------------------- | ------------------ | -| BK_SDM_TINY_VPRED_256 | 2900 | 2800 | -| BK_SDM_TINY_VPRED | 6700 | 6560 | - -## Voice Activity Detection (VAD) - -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------- | :--------------------: | :----------------: | -| FSMN_VAD | 97 | 45,9 | +| BK_SDM_TINY_VPRED_256 | 2400 | 2400 | +| BK_SDM_TINY_VPRED | 6210 | 6050 | diff --git a/docs/docs/04-benchmarks/model-size.md b/docs/docs/04-benchmarks/model-size.md index 2a648ac53..00e819494 100644 --- a/docs/docs/04-benchmarks/model-size.md +++ b/docs/docs/04-benchmarks/model-size.md @@ -25,23 +25,23 @@ title: Model Size ## OCR -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. 
## Vertical OCR -| Model | XNNPACK [MB] | -| ------------------------ | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | -| Recognizer (CRNN_EN_512) | 15 - 18\* | -| Recognizer (CRNN_EN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_320_QUANTIZED) | 19.8 | +| Recognizer (CRNN_EN_512) | 15 - 18\* | +| Recognizer (CRNN_EN_64) | 15 - 16\* | \* - The model weights vary depending on the language. diff --git a/docs/versioned_docs/version-0.4.x/benchmarks/inference-time.md b/docs/versioned_docs/version-0.4.x/benchmarks/inference-time.md index da35e7b6e..f5d6d0113 100644 --- a/docs/versioned_docs/version-0.4.x/benchmarks/inference-time.md +++ b/docs/versioned_docs/version-0.4.x/benchmarks/inference-time.md @@ -8,50 +8,52 @@ Times presented in the tables are measured as consecutive runs of the model. Ini ## Classification -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 150 | 161 | 227 | 196 | 214 | ## Object Detection -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 261 | 279 | 414 | 125 | 115 | ## Style Transfer -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_MOSAIC | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_UDNIE | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1565 | 1675 | 2325 | 1750 | 1620 | ## OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | 
:------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_800) | 2099 | 2227 | ❌ | 2245 | 7108 | -| Recognizer (CRNN_512) | 70 | 252 | ❌ | 54 | 151 | -| Recognizer (CRNN_256) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_128) | 17 | 83 | ❌ | 14 | 39 | +Notice that the recognizer models were executed between 3 and 7 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_800_QUANTIZED) | 779 | 897 | 1276 | 553 | 586 | +| Recognizer (CRNN_512) | 77 | 74 | 244 | 56 | 57 | +| Recognizer (CRNN_256) | 35 | 37 | 120 | 28 | 30 | +| Recognizer (CRNN_128) | 18 | 19 | 60 | 14 | 16 | ## Vertical OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_1280) | 5457 | 5833 | ❌ | 6296 | 14053 | -| Detector (CRAFT_320) | 1351 | 1460 | ❌ | 1485 | 3101 | -| Recognizer (CRNN_512) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_64) | 10 | 33 | ❌ | 7 | 18 | +Notice that the recognizer models, as well as detector CRAFT_320 model, were executed between 4 and 21 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_1280_QUANTIZED) | 1918 | 2304 | 3371 | 1391 | 1445 | +| Detector (CRAFT_320_QUANTIZED) | 473 | 563 | 813 | 361 | 382 | +| Recognizer (CRNN_512) | 78 | 83 | 310 | 59 | 57 | +| Recognizer (CRNN_64) | 9 | 9 | 38 | 8 | 7 | ## LLMs -| Model | iPhone 16 Pro (XNNPACK) [tokens/s] | iPhone 13 Pro (XNNPACK) [tokens/s] | iPhone SE 3 (XNNPACK) [tokens/s] | Samsung Galaxy S24 (XNNPACK) [tokens/s] | OnePlus 12 (XNNPACK) [tokens/s] | +| Model | iPhone 17 Pro (XNNPACK) [tokens/s] | iPhone 16 Pro (XNNPACK) [tokens/s] | iPhone SE 3 (XNNPACK) [tokens/s] | Samsung Galaxy S24 (XNNPACK) [tokens/s] | OnePlus 12 (XNNPACK) [tokens/s] | | --------------------- | :--------------------------------: | :--------------------------------: | :------------------------------: | :-------------------------------------: | :-----------------------------: | | LLAMA3_2_1B | 16.1 | 11.4 | ❌ | 15.6 | 19.3 | | LLAMA3_2_1B_SPINQUANT | 40.6 | 16.7 | 16.5 | 40.3 | 48.2 | @@ -68,7 +70,7 @@ Times presented in the tables are measured as consecutive runs of the model. 
Ini Notice than for `Whisper` model which has to take as an input 30 seconds audio chunks (for shorter audio it is automatically padded with silence to 30 seconds) `fast` mode has the lowest latency (time from starting transcription to first token returned, caused by streaming algorithm), but the slowest speed. That's why for the lowest latency and the fastest transcription we suggest using `Moonshine` model, if you still want to proceed with `Whisper` use preferably the `balanced` mode. -| Model (mode) | iPhone 16 Pro (XNNPACK) [latency \| tokens/s] | iPhone 14 Pro (XNNPACK) [latency \| tokens/s] | iPhone SE 3 (XNNPACK) [latency \| tokens/s] | Samsung Galaxy S24 (XNNPACK) [latency \| tokens/s] | OnePlus 12 (XNNPACK) [latency \| tokens/s] | +| Model (mode) | iPhone 17 Pro (XNNPACK) [latency \| tokens/s] | iPhone 16 Pro (XNNPACK) [latency \| tokens/s] | iPhone SE 3 (XNNPACK) [latency \| tokens/s] | Samsung Galaxy S24 (XNNPACK) [latency \| tokens/s] | OnePlus 12 (XNNPACK) [latency \| tokens/s] | | ------------------------- | :-------------------------------------------: | :-------------------------------------------: | :-----------------------------------------: | :------------------------------------------------: | :----------------------------------------: | | Moonshine-tiny (fast) | 0.8s \| 19.0t/s | 1.5s \| 11.3t/s | 1.5s \| 10.4t/s | 2.0s \| 8.8t/s | 1.6s \| 12.5t/s | | Moonshine-tiny (balanced) | 2.0s \| 20.0t/s | 3.2s \| 12.4t/s | 3.7s \| 10.4t/s | 4.6s \| 11.2t/s | 3.4s \| 14.6t/s | @@ -81,7 +83,7 @@ Notice than for `Whisper` model which has to take as an input 30 seconds audio c Average time for encoding audio of given length over 10 runs. For `Whisper` model we only list 30 sec audio chunks since `Whisper` does not accept other lengths (for shorter audio the audio needs to be padded to 30sec with silence). -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | -------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | | Moonshine-tiny (5s) | 99 | 95 | 115 | 284 | 277 | | Moonshine-tiny (10s) | 178 | 177 | 204 | 555 | 528 | @@ -92,7 +94,7 @@ Average time for encoding audio of given length over 10 runs. For `Whisper` mode Average time for decoding one token in sequence of 100 tokens, with encoding context is obtained from audio of noted length. 
-| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | -------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | | Moonshine-tiny (5s) | 48.98 | 47.98 | 46.86 | 36.70 | 29.03 | | Moonshine-tiny (10s) | 54.24 | 51.74 | 55.07 | 46.31 | 32.41 | @@ -101,9 +103,9 @@ Average time for decoding one token in sequence of 100 tokens, with encoding con ## Text Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 53 | 69 | 78 | 60 | 65 | -| ALL_MPNET_BASE_V2 | 352 | 423 | 478 | 521 | 527 | -| MULTI_QA_MINILM_L6_COS_V1 | 135 | 166 | 180 | 158 | 165 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 503 | 598 | 680 | 694 | 743 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 50 | 58 | 84 | 58 | 58 | +| ALL_MPNET_BASE_V2 | 352 | 428 | 879 | 483 | 517 | +| MULTI_QA_MINILM_L6_COS_V1 | 133 | 161 | 269 | 151 | 155 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 502 | 796 | 1216 | 915 | 713 | diff --git a/docs/versioned_docs/version-0.4.x/benchmarks/memory-usage.md b/docs/versioned_docs/version-0.4.x/benchmarks/memory-usage.md index 862ffd574..25298f630 100644 --- a/docs/versioned_docs/version-0.4.x/benchmarks/memory-usage.md +++ b/docs/versioned_docs/version-0.4.x/benchmarks/memory-usage.md @@ -25,16 +25,16 @@ title: Memory Usage ## OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 2100 | 1782 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ## Vertical OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2770 | 3720 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1770 | 2740 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ---------------------------------------------------------------------------------------- | :--------------------: | :----------------: | +| Detector (CRAFT_1280_QUANTIZED) + Detector (CRAFT_320_QUANTIZED) + Recognizer 
(CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1070 | 1000 | ## LLMs diff --git a/docs/versioned_docs/version-0.4.x/benchmarks/model-size.md b/docs/versioned_docs/version-0.4.x/benchmarks/model-size.md index f39fa2f14..d5e890120 100644 --- a/docs/versioned_docs/version-0.4.x/benchmarks/model-size.md +++ b/docs/versioned_docs/version-0.4.x/benchmarks/model-size.md @@ -25,23 +25,23 @@ title: Model Size ## OCR -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. ## Vertical OCR -| Model | XNNPACK [MB] | -| ------------------------ | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | -| Recognizer (CRNN_EN_512) | 15 - 18\* | -| Recognizer (CRNN_EN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_320_QUANTIZED) | 19.8 | +| Recognizer (CRNN_EN_512) | 15 - 18\* | +| Recognizer (CRNN_EN_64) | 15 - 16\* | \* - The model weights vary depending on the language. diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useClassification.md b/docs/versioned_docs/version-0.4.x/computer-vision/useClassification.md index fb812fb57..caef31b3d 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useClassification.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useClassification.md @@ -85,8 +85,8 @@ function App() { ## Supported models -| Model | Number of classes | Class list | -| --------------------------------------------------------------------------------------------------------------- | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Model | Number of classes | Class list | +| ----------------------------------------------------------------------------------------------------------------- | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [efficientnet_v2_s](https://pytorch.org/vision/stable/models/generated/torchvision.models.efficientnet_v2_s.html) | 1000 | [ImageNet1k_v1](https://github.com/software-mansion/react-native-executorch/blob/release/0.4/android/src/main/java/com/swmansion/rnexecutorch/models/classification/Constants.kt) | ## Benchmarks @@ -109,6 +109,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 150 | 161 | 227 | 196 | 214 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useOCR.md b/docs/versioned_docs/version-0.4.x/computer-vision/useOCR.md index 2c12300e8..960815719 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useOCR.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useOCR.md @@ -321,11 +321,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_800) | 2099 | 2227 | ❌ | 2245 | 7108 | -| Recognizer (CRNN_512) | 70 | 252 | ❌ | 54 | 151 | -| Recognizer (CRNN_256) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_128) | 17 | 83 | ❌ | 14 | 39 | - -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_800_QUANTIZED) | 779 | 897 | 1276 | 553 | 586 | +| Recognizer (CRNN_512) | 77 | 74 | 244 | 56 | 57 | +| Recognizer (CRNN_256) | 35 | 37 | 120 | 28 | 30 | +| Recognizer (CRNN_128) | 18 | 19 | 60 | 14 | 16 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useObjectDetection.md b/docs/versioned_docs/version-0.4.x/computer-vision/useObjectDetection.md index b18faa8f8..0bdbeef01 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useObjectDetection.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useObjectDetection.md @@ -145,6 +145,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 261 | 279 | 414 | 125 | 115 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useStyleTransfer.md b/docs/versioned_docs/version-0.4.x/computer-vision/useStyleTransfer.md index 40f30a1d0..09599bac7 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useStyleTransfer.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useStyleTransfer.md @@ -107,9 +107,9 @@ function App(){ Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_MOSAIC | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_UDNIE | 1565 | 1675 | 2325 | 1750 | 1620 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1565 | 1675 | 2325 | 1750 | 1620 | diff --git a/docs/versioned_docs/version-0.4.x/computer-vision/useVerticalOCR.md b/docs/versioned_docs/version-0.4.x/computer-vision/useVerticalOCR.md index 98cc301bf..ce9b456e3 100644 --- a/docs/versioned_docs/version-0.4.x/computer-vision/useVerticalOCR.md +++ b/docs/versioned_docs/version-0.4.x/computer-vision/useVerticalOCR.md @@ -342,11 +342,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_1280) | 5457 | 5833 | ❌ | 6296 | 14053 | -| Detector (CRAFT_320) | 1351 | 1460 | ❌ | 1485 | 3101 | -| Recognizer (CRNN_512) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_64) | 10 | 33 | ❌ | 7 | 18 | - -❌ - Insufficient RAM. 
+| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_1280_QUANTIZED) | 1918 | 2304 | 3371 | 1391 | 1445 | +| Detector (CRAFT_320_QUANTIZED) | 473 | 563 | 813 | 361 | 382 | +| Recognizer (CRNN_512) | 78 | 83 | 310 | 59 | 57 | +| Recognizer (CRNN_64) | 9 | 9 | 38 | 8 | 7 | diff --git a/docs/versioned_docs/version-0.4.x/natural-language-processing/useTextEmbeddings.md b/docs/versioned_docs/version-0.4.x/natural-language-processing/useTextEmbeddings.md index 43fefe3d6..5aeeaa02b 100644 --- a/docs/versioned_docs/version-0.4.x/natural-language-processing/useTextEmbeddings.md +++ b/docs/versioned_docs/version-0.4.x/natural-language-processing/useTextEmbeddings.md @@ -148,9 +148,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 53 | 69 | 78 | 60 | 65 | -| ALL_MPNET_BASE_V2 | 352 | 423 | 478 | 521 | 527 | -| MULTI_QA_MINILM_L6_COS_V1 | 135 | 166 | 180 | 158 | 165 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 503 | 598 | 680 | 694 | 743 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 50 | 58 | 84 | 58 | 58 | +| ALL_MPNET_BASE_V2 | 352 | 428 | 879 | 483 | 517 | +| MULTI_QA_MINILM_L6_COS_V1 | 133 | 161 | 269 | 151 | 155 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 502 | 796 | 1216 | 915 | 713 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useSpeechToText.md b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useSpeechToText.md index 3256e2e88..d94c96a66 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useSpeechToText.md @@ -75,20 +75,20 @@ For more information on loading resources, take a look at [loading models](../.. 
### Returns -| Field | Type | Description | -| --------------------------- | ---------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. | -| `stream` | `() => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | -| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | -| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | -| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | -| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | -| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | -| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | -| `error` | `string \| null` | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Tracks the progress of the model download process. | +| Field | Type | Description | +| --------------------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `transcribe` | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. 
| +| `stream` | `(options?: DecodingOptions \| undefined) => Promise` | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses. | +| `streamInsert` | `(waveform: Float32Array \| number[]) => void` | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated. | +| `streamStop` | `() => void` | Stops the ongoing streaming transcription process. | +| `encode` | `(waveform: Float32Array \| number[]) => Promise` | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated. | +| `decode` | `(tokens: number[] \| Int32Array, encoderOutput: Float32Array \| number[]) => Promise` | Runs the decoder of the model. Passing `number[]` is deprecated. | +| `committedTranscription` | `string` | Contains the part of the transcription that is finalized and will not change. Useful for displaying stable results during streaming. | +| `nonCommittedTranscription` | `string` | Contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming. | +| `error` | `string \| null` | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Tracks the progress of the model download process. |
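The streaming fields above are meant to be used together: `stream` opens a session, `streamInsert` feeds 16 kHz audio as it arrives, and `streamStop` ends it, while `committedTranscription` / `nonCommittedTranscription` update along the way. Below is a minimal sketch of that flow; the hook configuration (`model: WHISPER_TINY`) and the `getAudioChunks` recorder helper are assumptions for illustration, not part of the documented API.

```tsx
import React, { useEffect } from 'react';
import { Text, View } from 'react-native';
// NOTE: the config shape and the WHISPER_TINY constant are assumptions for this sketch.
import { useSpeechToText, WHISPER_TINY } from 'react-native-executorch';

// Hypothetical helper: yields Float32Array chunks of 16 kHz mono audio
// from whatever audio recorder you use (not part of this library).
declare function getAudioChunks(): AsyncIterable<Float32Array>;

export function LiveTranscription() {
  const {
    stream,
    streamInsert,
    streamStop,
    committedTranscription,
    nonCommittedTranscription,
    isReady,
  } = useSpeechToText({ model: WHISPER_TINY });

  useEffect(() => {
    if (!isReady) return;
    let cancelled = false;

    (async () => {
      // Start the streaming session (pass e.g. { language: 'es' } for multilingual models).
      const session = stream();
      // Feed 16 kHz audio chunks as they become available.
      for await (const chunk of getAudioChunks()) {
        if (cancelled) break;
        streamInsert(chunk);
      }
      streamStop();
      await session;
    })();

    return () => {
      cancelled = true;
      streamStop();
    };
  }, [isReady]);

  return (
    <View>
      {/* Stable text first, still-changing text dimmed after it. */}
      <Text>{committedTranscription}</Text>
      <Text style={{ opacity: 0.5 }}>{nonCommittedTranscription}</Text>
    </View>
  );
}
```

If the model fails to load, `error` is set and `isReady` stays `false`, so it may be worth branching on those before starting a session.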
Type definitions @@ -340,4 +340,4 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | -| WHISPER_TINY | 900 | 600 | +| WHISPER_TINY | 410 | 375 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md index c40d19e94..fd595d208 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useTextEmbeddings.md @@ -133,11 +133,11 @@ For the supported models, the returned embedding vector is normalized, meaning t | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | -| ALL_MINILM_L6_V2 | 85 | 100 | -| ALL_MPNET_BASE_V2 | 390 | 465 | -| MULTI_QA_MINILM_L6_COS_V1 | 115 | 130 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 415 | 490 | -| CLIP_VIT_BASE_PATCH32_TEXT | 195 | 250 | +| ALL_MINILM_L6_V2 | 95 | 110 | +| ALL_MPNET_BASE_V2 | 405 | 455 | +| MULTI_QA_MINILM_L6_COS_V1 | 120 | 140 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 435 | 455 | +| CLIP_VIT_BASE_PATCH32_TEXT | 200 | 280 | ### Inference time @@ -145,13 +145,13 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 15 | 22 | 23 | 36 | 31 | -| ALL_MPNET_BASE_V2 | 71 | 96 | 101 | 112 | 105 | -| MULTI_QA_MINILM_L6_COS_V1 | 15 | 22 | 23 | 36 | 31 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 71 | 95 | 100 | 112 | 105 | -| CLIP_VIT_BASE_PATCH32_TEXT | 31 | 47 | 48 | 55 | 49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 16 | 16 | 19 | 54 | 28 | +| ALL_MPNET_BASE_V2 | 115 | 116 | 144 | 145 | 95 | +| MULTI_QA_MINILM_L6_COS_V1 | 16 | 16 | 20 | 47 | 28 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 112 | 119 | 144 | 146 | 96 | +| CLIP_VIT_BASE_PATCH32_TEXT | 47 | 45 | 57 | 65 | 48 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. 
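Because the returned embedding vectors are normalized, cosine similarity between two embeddings reduces to a plain dot product; and since CLIP places text and image embeddings in the same vector space, the same comparison works across the two encoders. A small sketch follows; the `useTextEmbeddings` call in the comment is illustrative and its config shape is an assumption.

```typescript
// Cosine similarity for L2-normalized vectors is just the dot product.
function similarity(a: ArrayLike<number>, b: ArrayLike<number>): number {
  if (a.length !== b.length) {
    throw new Error('Embeddings must have the same dimensionality');
  }
  let dot = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
  }
  return dot; // ~1.0 => very similar, ~0 => unrelated
}

// Illustrative usage (hook config shape assumed):
// const { forward } = useTextEmbeddings({ model: ALL_MINILM_L6_V2 });
// const glass = await forward('a glass of water');
// const cup = await forward('a cup of tea');
// console.log(similarity(glass, cup));
```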
diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useClassification.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useClassification.md index b4d3f34a6..e17bfa775 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useClassification.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useClassification.md @@ -100,7 +100,7 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 130 | 85 | +| EFFICIENTNET_V2_S | 230 | 87 | ### Inference time @@ -108,6 +108,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 105 | 110 | 149 | 299 | 227 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useImageEmbeddings.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useImageEmbeddings.md index 1849a95ce..4d417590c 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useImageEmbeddings.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useImageEmbeddings.md @@ -91,9 +91,9 @@ try { ## Supported models -| Model | Language | Image size | Embedding Dimensions | Description | +| Model | Language | Image size | Embedding dimensions | Description | | ---------------------------------------------------------------------------------- | :------: | :--------: | :------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [clip-vit-base-patch32-image](https://huggingface.co/openai/clip-vit-base-patch32) | English | 224 x 224 | 512 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. CLIP allows to embed images and text into the same vector space. This allows to find similar images as well as to implement image search. This is the image encoder part of the CLIP model. To embed text checkout [clip-vit-base-patch32-text](../01-natural-language-processing/useTextEmbeddings.md#supported-models). | +| [clip-vit-base-patch32-image](https://huggingface.co/openai/clip-vit-base-patch32) | English | 224×224 | 512 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. CLIP allows to embed images and text into the same vector space. This allows to find similar images as well as to implement image search. 
This is the image encoder part of the CLIP model. To embed text checkout [clip-vit-base-patch32-text](../01-natural-language-processing/useTextEmbeddings.md#supported-models). | **`Image size`** - the size of an image that the model takes as an input. Resize will happen automatically. @@ -123,9 +123,9 @@ For the supported models, the returned embedding vector is normalized, meaning t Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. Performance also heavily depends on image size, because resize is expansive operation, especially on low-end devices. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 48 | 64 | 69 | 65 | 63 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 70 | 70 | 90 | 66 | 58 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useOCR.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useOCR.md index a23acd17c..5a1e80cfc 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useOCR.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useOCR.md @@ -134,13 +134,13 @@ For more information on loading resources, take a look at [loading models](../.. The hook returns an object with the following properties: -| Field | Type | Description | -| ------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. | -| `error` | string | null | Contains the error message if the model loading failed. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. | +| Field | Type | Description | +| ------------------ | -------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| `forward` | `(imageSource: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. | +| `error` | string | null | Contains the error message if the model loading failed. 
| +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. | ## Running the model @@ -288,20 +288,20 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc ### Model size -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. ### Memory usage -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1600 | 1700 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ### Inference time @@ -317,18 +317,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini **Time measurements:** -| Metric | iPhone 14 Pro Max
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | -| ------------------------- | ----------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 4330 | 2537 | ❌ | 6648 | 5993 | -| **Detector (CRAFT_800)** | 1945 | 1809 | ❌ | 2080 | 1961 | -| **Recognizer (CRNN_512)** | | | | | | -| ├─ Average Time | 273 | 76 | ❌ | 289 | 252 | -| ├─ Total Time (3 runs) | 820 | 229 | ❌ | 867 | 756 | -| **Recognizer (CRNN_256)** | | | | | | -| ├─ Average Time | 137 | 39 | ❌ | 260 | 229 | -| ├─ Total Time (7 runs) | 958 | 271 | ❌ | 1818 | 1601 | -| **Recognizer (CRNN_128)** | | | | | | -| ├─ Average Time | 68 | 18 | ❌ | 239 | 214 | -| ├─ Total Time (7 runs) | 478 | 124 | ❌ | 1673 | 1498 | - -❌ - Insufficient RAM. +| Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | +| ---------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | +| **Total Inference Time** | 1160 | 1144 | 1498 | 1567 | 1160 | +| **Detector (CRAFT_800_QUANTIZED)** | 669 | 649 | 825 | 541 | 474 | +| **Recognizer (CRNN_512)** | | | | | | +| ├─ Average Time | 48 | 47 | 60 | 91 | 72 | +| ├─ Total Time (3 runs) | 144 | 141 | 180 | 273 | 216 | +| **Recognizer (CRNN_256)** | | | | | | +| ├─ Average Time | 22 | 22 | 29 | 51 | 30 | +| ├─ Total Time (7 runs) | 154 | 154 | 203 | 357 | 210 | +| **Recognizer (CRNN_128)** | | | | | | +| ├─ Average Time | 11 | 11 | 14 | 28 | 17 | +| ├─ Total Time (7 runs) | 77 | 77 | 98 | 196 | 119 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useObjectDetection.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useObjectDetection.md index ac756d6a6..7f49e8389 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useObjectDetection.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useObjectDetection.md @@ -139,7 +139,7 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 90 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | ### Inference time @@ -147,6 +147,6 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. ::: -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 116 | 120 | 164 | 257 | 129 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useStyleTransfer.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useStyleTransfer.md index 899a619ca..2bedba325 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useStyleTransfer.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useStyleTransfer.md @@ -95,10 +95,10 @@ function App() { | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 950 | 350 | -| STYLE_TRANSFER_MOSAIC | 950 | 350 | -| STYLE_TRANSFER_UDNIE | 950 | 350 | -| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 | +| STYLE_TRANSFER_CANDY | 1200 | 380 | +| STYLE_TRANSFER_MOSAIC | 1200 | 380 | +| STYLE_TRANSFER_UDNIE | 1200 | 380 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | ### Inference time @@ -106,9 +106,9 @@ function App() { Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. 
::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1356 | 1550 | 2003 | 2578 | 2328 | +| STYLE_TRANSFER_MOSAIC | 1376 | 1456 | 1971 | 2657 | 2394 | +| STYLE_TRANSFER_UDNIE | 1389 | 1499 | 1858 | 2380 | 2124 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1339 | 1514 | 2004 | 2608 | 2371 | diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useVerticalOCR.md b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useVerticalOCR.md index e15c08fbe..73c3fc108 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useVerticalOCR.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/02-computer-vision/useVerticalOCR.md @@ -147,13 +147,13 @@ For more information on loading resources, take a look at [loading models](../.. The hook returns an object with the following properties: -| Field | Type | Description | -| ------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. | -| `error` | string | null | Contains the error message if the model loading failed. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. | +| Field | Type | Description | +| ------------------ | -------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| `forward` | `(imageSource: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. | +| `error` | string | null | Contains the error message if the model loading failed. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. 
| ## Running the model @@ -302,12 +302,12 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc ### Model size -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_32_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_64) | 15 - 16\* | \* - The model weights vary depending on the language. @@ -315,8 +315,8 @@ You need to make sure the recognizer models you pass in `recognizerSources` matc | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2172 | 2214 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1774 | 1705 | +| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1070 | 1000 | ### Inference time @@ -332,18 +332,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini **Time measurements:** -| Metric | iPhone 14 Pro Max
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | -| -------------------------------------------------------------------------- | ----------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | -| **Total Inference Time** | 9350 / 9620 | 8572 / 8621 | ❌ | 13737 / 10570 | 13436 / 9848 | -| **Detector (CRAFT_1250)** | 4895 | 4756 | ❌ | 5574 | 5016 | -| **Detector (CRAFT_320)** | | | | | | -| ├─ Average Time | 1247 | 1206 | ❌ | 1350 | 1356 | -| ├─ Total Time (3 runs) | 3741 | 3617 | ❌ | 4050 | 4069 | -| **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | -| ├─ Average Time | 31 | 9 | ❌ | 195 | 207 | -| ├─ Total Time (21 runs) | 649 | 191 | ❌ | 4092 | 4339 | -| **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | -| ├─ Average Time | 306 | 80 | ❌ | 308 | 250 | -| ├─ Total Time (3 runs) | 919 | 240 | ❌ | 925 | 751 | - -❌ - Insufficient RAM. +| Metric | iPhone 17 Pro
[ms] | iPhone 16 Pro
[ms] | iPhone SE 3 | Samsung Galaxy S24
[ms] | OnePlus 12
[ms] | +| -------------------------------------------------------------------------- | ------------------------- | ------------------------- | ----------- | ------------------------------ | ---------------------- | +| **Total Inference Time** | 3819 / 3716 | 3978 / 3841 | 4751 / 4532 | 3095 / 3286 | 2787 / 2770 | +| **Detector (CRAFT_1280_QUANTIZED)** | 1749 | 1804 | 2105 | 1216 | 1171 | +| **Detector (CRAFT_320_QUANTIZED)** | | | | | | +| ├─ Average Time | 458 | 474 | 561 | 360 | 332 | +| ├─ Total Time (4 runs) | 1832 | 1896 | 2244 | 1440 | 1328 | +| **Recognizer (CRNN_64)**
(_With Flag `independentChars == true`_) | | | | | | +| ├─ Average Time | 5 | 6 | 7 | 28 | 11 | +| ├─ Total Time (21 runs) | 105 | 126 | 147 | 588 | 231 | +| **Recognizer (CRNN_512)**
(_With Flag `independentChars == false`_) | | | | | | +| ├─ Average Time | 54 | 52 | 68 | 144 | 72 | +| ├─ Total Time (4 runs) | 216 | 208 | 272 | 576 | 288 | diff --git a/docs/versioned_docs/version-0.5.x/04-benchmarks/inference-time.md b/docs/versioned_docs/version-0.5.x/04-benchmarks/inference-time.md index 504c0f6e9..89f1f9de1 100644 --- a/docs/versioned_docs/version-0.5.x/04-benchmarks/inference-time.md +++ b/docs/versioned_docs/version-0.5.x/04-benchmarks/inference-time.md @@ -8,46 +8,48 @@ Times presented in the tables are measured as consecutive runs of the model. Ini ## Classification -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 100 | 120 | 130 | 180 | 170 | +| EFFICIENTNET_V2_S | 105 | 110 | 149 | 299 | 227 | ## Object Detection -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 13 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 190 | 260 | 280 | 100 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 116 | 120 | 164 | 257 | 129 | ## Style Transfer -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 13 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | | ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_MOSAIC | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 | -| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 | +| STYLE_TRANSFER_CANDY | 1356 | 1550 | 2003 | 2578 | 2328 | +| STYLE_TRANSFER_MOSAIC | 1376 | 1456 | 1971 | 2657 | 2394 | +| STYLE_TRANSFER_UDNIE | 1389 | 1499 | 1858 | 2380 | 2124 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1339 | 1514 | 2004 | 2608 | 2371 | ## OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_800) | 2099 | 2227 | ❌ | 2245 | 7108 | -| Recognizer (CRNN_512) | 70 | 252 | ❌ | 54 | 151 | -| Recognizer (CRNN_256) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_128) | 17 | 83 | ❌ | 14 | 39 | +Notice that the recognizer models were 
executed between 3 and 7 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_800_QUANTIZED) | 669 | 649 | 825 | 541 | 474 | +| Recognizer (CRNN_512) | 48 | 47 | 60 | 91 | 72 | +| Recognizer (CRNN_256) | 22 | 22 | 29 | 51 | 30 | +| Recognizer (CRNN_128) | 11 | 11 | 14 | 28 | 17 | ## Vertical OCR -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | -| --------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-------------------------------: | -| Detector (CRAFT_1280) | 5457 | 5833 | ❌ | 6296 | 14053 | -| Detector (CRAFT_320) | 1351 | 1460 | ❌ | 1485 | 3101 | -| Recognizer (CRNN_512) | 39 | 123 | ❌ | 24 | 78 | -| Recognizer (CRNN_64) | 10 | 33 | ❌ | 7 | 18 | +Notice that the recognizer models, as well as detector CRAFT_320 model, were executed between 4 and 21 times during a single recognition. +The values below represent the averages across all runs for the benchmark image. -❌ - Insufficient RAM. +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Detector (CRAFT_1280_QUANTIZED) | 1749 | 1804 | 2105 | 1216 | 1171 | +| Detector (CRAFT_320_QUANTIZED) | 458 | 474 | 561 | 360 | 332 | +| Recognizer (CRNN_512) | 54 | 52 | 68 | 144 | 72 | +| Recognizer (CRNN_64) | 5 | 6 | 7 | 28 | 11 | ## LLMs @@ -62,41 +64,31 @@ Times presented in the tables are measured as consecutive runs of the model. Ini ❌ - Insufficient RAM. -### Streaming mode - -Notice than for `Whisper` model which has to take as an input 30 seconds audio chunks (for shorter audio it is automatically padded with silence to 30 seconds) `fast` mode has the lowest latency (time from starting transcription to first token returned, caused by streaming algorithm), but the slowest speed. If you believe that this might be a problem for you, prefer `balanced` mode instead. 
- -| Model (mode) | iPhone 16 Pro (XNNPACK) [latency \| tokens/s] | iPhone 14 Pro (XNNPACK) [latency \| tokens/s] | iPhone SE 3 (XNNPACK) [latency \| tokens/s] | Samsung Galaxy S24 (XNNPACK) [latency \| tokens/s] | OnePlus 12 (XNNPACK) [latency \| tokens/s] | -| ------------------------- | :-------------------------------------------: | :-------------------------------------------: | :-----------------------------------------: | :------------------------------------------------: | :----------------------------------------: | -| Whisper-tiny (fast) | 2.8s \| 5.5t/s | 3.7s \| 4.4t/s | 4.4s \| 3.4t/s | 5.5s \| 3.1t/s | 5.3s \| 3.8t/s | -| Whisper-tiny (balanced) | 5.6s \| 7.9t/s | 7.0s \| 6.3t/s | 8.3s \| 5.0t/s | 8.4s \| 6.7t/s | 7.7s \| 7.2t/s | -| Whisper-tiny (quality) | 10.3s \| 8.3t/s | 12.6s \| 6.8t/s | 7.8s \| 8.9t/s | 13.5s \| 7.1t/s | 12.9s \| 7.5t/s | - ### Encoding Average time for encoding audio of given length over 10 runs. For `Whisper` model we only list 30 sec audio chunks since `Whisper` does not accept other lengths (for shorter audio the audio needs to be padded to 30sec with silence). -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| -------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 1034 | 1344 | 1269 | 2916 | 2143 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Whisper-tiny (30s) | 1391 | 1372 | 1894 | 1303 | 1214 | ### Decoding -Average time for decoding one token in sequence of 100 tokens, with encoding context is obtained from audio of noted length. +Average time for decoding one token in sequence of approximately 100 tokens, with encoding context is obtained from audio of noted length. 
-| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| -------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| Whisper-tiny (30s) | 128.03 | 113.65 | 141.63 | 89.08 | 84.49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| Whisper-tiny (30s) | 53 | 53 | 74 | 100 | 84 | ## Text Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) | OnePlus 12 (XNNPACK) [ms] | -| -------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :--------------------------: | :-----------------------: | -| ALL_MINILM_L6_V2 | 15 | 22 | 23 | 36 | 31 | -| ALL_MPNET_BASE_V2 | 71 | 96 | 101 | 112 | 105 | -| MULTI_QA_MINILM_L6_COS_V1 | 15 | 22 | 23 | 36 | 31 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 71 | 95 | 100 | 112 | 105 | -| CLIP_VIT_BASE_PATCH32_TEXT | 31 | 47 | 48 | 55 | 49 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| -------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| ALL_MINILM_L6_V2 | 16 | 16 | 19 | 54 | 28 | +| ALL_MPNET_BASE_V2 | 115 | 116 | 144 | 145 | 95 | +| MULTI_QA_MINILM_L6_COS_V1 | 16 | 16 | 20 | 47 | 28 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 112 | 119 | 144 | 146 | 96 | +| CLIP_VIT_BASE_PATCH32_TEXT | 47 | 45 | 57 | 65 | 48 | :::info Benchmark times for text embeddings are highly dependent on the sentence length. The numbers above are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly. @@ -104,10 +96,16 @@ Benchmark times for text embeddings are highly dependent on the sentence length. ## Image Embeddings -| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :------------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 48 | 64 | 69 | 65 | 63 | +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 70 | 70 | 90 | 66 | 58 | :::info Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. 
Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. ::: + +## Text to Image + +| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | +| --------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | +| BK_SDM_TINY_VPRED_256 | 21184 | 21021 | ❌ | 18834 | 16617 | diff --git a/docs/versioned_docs/version-0.5.x/04-benchmarks/memory-usage.md b/docs/versioned_docs/version-0.5.x/04-benchmarks/memory-usage.md index 684020e2a..a0c5a7b6d 100644 --- a/docs/versioned_docs/version-0.5.x/04-benchmarks/memory-usage.md +++ b/docs/versioned_docs/version-0.5.x/04-benchmarks/memory-usage.md @@ -2,69 +2,80 @@ title: Memory Usage --- +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 (Android). +::: + ## Classification | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 130 | 85 | +| EFFICIENTNET_V2_S | 230 | 87 | ## Object Detection | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 90 | 90 | +| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | ## Style Transfer | Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | | ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 950 | 350 | -| STYLE_TRANSFER_MOSAIC | 950 | 350 | -| STYLE_TRANSFER_UDNIE | 950 | 350 | -| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 | +| STYLE_TRANSFER_CANDY | 1200 | 380 | +| STYLE_TRANSFER_MOSAIC | 1200 | 380 | +| STYLE_TRANSFER_UDNIE | 1200 | 380 | +| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | ## OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_800) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 2100 | 1782 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------------------------------------------------------------------------ | :--------------------: | :----------------: | +| Detector (CRAFT_800_QUANTIZED) + Recognizer (CRNN_512) + Recognizer (CRNN_256) + Recognizer (CRNN_128) | 1400 | 1320 | ## Vertical OCR -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| -------------------------------------------------------------------- | :--------------------: | :----------------: | -| Detector (CRAFT_1280) + Detector (CRAFT_320) + Recognizer (CRNN_512) | 2770 | 3720 | -| Detector(CRAFT_1280) + Detector(CRAFT_320) + Recognizer (CRNN_64) | 1770 | 2740 | +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ---------------------------------------------------------------------------------------- | :--------------------: | :----------------: | +| Detector (CRAFT_1280_QUANTIZED) + Detector (CRAFT_320_QUANTIZED) + Recognizer (CRNN_512) | 1540 | 1470 | +| Detector(CRAFT_1280_QUANTIZED) + Detector(CRAFT_320_QUANTIZED) + Recognizer (CRNN_64) | 1070 | 1000 | ## LLMs | Model | Android (XNNPACK) [GB] | iOS (XNNPACK) [GB] | | --------------------- | :--------------------: | :----------------: | -| 
LLAMA3_2_1B | 3.2 | 3.1 | -| LLAMA3_2_1B_SPINQUANT | 1.9 | 2 | -| LLAMA3_2_1B_QLORA | 2.2 | 2.5 | +| LLAMA3_2_1B | 3.3 | 3.1 | +| LLAMA3_2_1B_SPINQUANT | 1.9 | 2.4 | +| LLAMA3_2_1B_QLORA | 2.7 | 2.8 | | LLAMA3_2_3B | 7.1 | 7.3 | | LLAMA3_2_3B_SPINQUANT | 3.7 | 3.8 | -| LLAMA3_2_3B_QLORA | 4 | 4.1 | +| LLAMA3_2_3B_QLORA | 3.9 | 4.0 | ## Speech to text | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | -| WHISPER_TINY | 900 | 600 | +| WHISPER_TINY | 410 | 375 | ## Text Embeddings | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | -| ALL_MINILM_L6_V2 | 85 | 100 | -| ALL_MPNET_BASE_V2 | 390 | 465 | -| MULTI_QA_MINILM_L6_COS_V1 | 115 | 130 | -| MULTI_QA_MPNET_BASE_DOT_V1 | 415 | 490 | -| CLIP_VIT_BASE_PATCH32_TEXT | 195 | 250 | +| ALL_MINILM_L6_V2 | 95 | 110 | +| ALL_MPNET_BASE_V2 | 405 | 455 | +| MULTI_QA_MINILM_L6_COS_V1 | 120 | 140 | +| MULTI_QA_MPNET_BASE_DOT_V1 | 435 | 455 | +| CLIP_VIT_BASE_PATCH32_TEXT | 200 | 280 | ## Image Embeddings | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | --------------------------- | :--------------------: | :----------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 350 | 340 | +| CLIP_VIT_BASE_PATCH32_IMAGE | 345 | 340 | + +## Text to Image + +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| --------------------- | ---------------------- | ------------------ | +| BK_SDM_TINY_VPRED_256 | 2400 | 2400 | +| BK_SDM_TINY_VPRED | 6210 | 6050 | diff --git a/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md b/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md index 9d20c95d5..128cbd7fb 100644 --- a/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md +++ b/docs/versioned_docs/version-0.5.x/04-benchmarks/model-size.md @@ -25,23 +25,23 @@ title: Model Size ## OCR -| Model | XNNPACK [MB] | -| --------------------- | :----------: | -| Detector (CRAFT_800) | 83.1 | -| Recognizer (CRNN_512) | 15 - 18\* | -| Recognizer (CRNN_256) | 16 - 18\* | -| Recognizer (CRNN_128) | 17 - 19\* | +| Model | XNNPACK [MB] | +| ------------------------------ | :----------: | +| Detector (CRAFT_800_QUANTIZED) | 19.8 | +| Recognizer (CRNN_512) | 15 - 18\* | +| Recognizer (CRNN_256) | 16 - 18\* | +| Recognizer (CRNN_128) | 17 - 19\* | \* - The model weights vary depending on the language. ## Vertical OCR -| Model | XNNPACK [MB] | -| ------------------------ | :----------: | -| Detector (CRAFT_1280) | 83.1 | -| Detector (CRAFT_320) | 83.1 | -| Recognizer (CRNN_EN_512) | 15 - 18\* | -| Recognizer (CRNN_EN_64) | 15 - 16\* | +| Model | XNNPACK [MB] | +| ------------------------------- | :----------: | +| Detector (CRAFT_1280_QUANTIZED) | 19.8 | +| Detector (CRAFT_320_QUANTIZED) | 19.8 | +| Recognizer (CRNN_EN_512) | 15 - 18\* | +| Recognizer (CRNN_EN_64) | 15 - 16\* | \* - The model weights vary depending on the language. 
@@ -82,3 +82,9 @@ title: Model Size
 | Model | XNNPACK [MB] |
 | --------------------------- | :----------: |
 | CLIP_VIT_BASE_PATCH32_IMAGE | 352 |
+
+## Text to Image
+
+| Model | Text encoder (XNNPACK) [MB] | UNet (XNNPACK) [MB] | VAE decoder (XNNPACK) [MB] |
+| ----------------- | --------------------------- | ------------------- | -------------------------- |
+| BK_SDM_TINY_VPRED | 492 | 1290 | 198 |
diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
index 95f6e9e55..c25fbd13f 100644
--- a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
@@ -108,22 +108,6 @@ void RnExecutorchInstaller::injectJSIBindings(
 
   threads::utils::unsafeSetupThreadPool();
   threads::GlobalThreadPool::initialize();
-
-#if defined(__ANDROID__) && defined(__aarch64__)
-  auto num_of_perf_cores =
-      ::executorch::extension::cpuinfo::get_num_performant_cores();
-  log(LOG_LEVEL::Info, "Detected ", num_of_perf_cores, " performant cores");
-  // setting num_of_cores to floor(num_of_perf_cores / 2) + 1) because depending
-  // on cpu arch as when possible we want to leave at least 2 performant cores
-  // for other tasks (setting more actually results in drop of performance). For
-  // older devices (i.e. samsung s22) resolves to 3 cores, and for newer ones
-  // (like OnePlus 12) resolves to 4, which when benchamrked gives highest
-  // throughput.
-  auto num_of_cores = static_cast(num_of_perf_cores / 2) + 1;
-  ::executorch::extension::threadpool::get_threadpool()
-      ->_unsafe_reset_threadpool(num_of_cores);
-  log(LOG_LEVEL::Info, "Configuring xnnpack for ", num_of_cores, " threads");
-#endif
 }
 
 } // namespace rnexecutorch
diff --git a/packages/react-native-executorch/common/rnexecutorch/data_processing/dsp.cpp b/packages/react-native-executorch/common/rnexecutorch/data_processing/dsp.cpp
index d3761dced..b1c8714a2 100644
--- a/packages/react-native-executorch/common/rnexecutorch/data_processing/dsp.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/data_processing/dsp.cpp
@@ -1,6 +1,4 @@
-#include
 #include
-#include
 #include
 #include
 #include
@@ -18,48 +16,4 @@ std::vector hannWindow(size_t size) {
   return window;
 }
 
-std::vector stftFromWaveform(std::span waveform,
-                             size_t fftWindowSize, size_t hopSize) {
-  // Initialize FFT
-  FFT fft(fftWindowSize);
-
-  const auto numFrames = 1 + (waveform.size() - fftWindowSize) / hopSize;
-  const auto numBins = fftWindowSize / 2;
-  const auto hann = hannWindow(fftWindowSize);
-  auto inBuffer = std::vector(fftWindowSize);
-  auto outBuffer = std::vector>(fftWindowSize);
-
-  // Output magnitudes in dB
-  std::vector magnitudes;
-  magnitudes.reserve(numFrames * numBins);
-  const auto magnitudeScale = 1.0f / static_cast(fftWindowSize);
-  constexpr auto epsilon = std::numeric_limits::epsilon();
-  constexpr auto dbConversionFactor = 20.0f;
-
-  for (size_t t = 0; t < numFrames; ++t) {
-    const size_t offset = t * hopSize;
-    // Clear the input buffer first
-    std::ranges::fill(inBuffer, 0.0f);
-
-    // Fill frame with windowed signal
-    const size_t samplesToRead =
-        std::min(fftWindowSize, waveform.size() - offset);
-    for (size_t i = 0; i < samplesToRead; i++) {
-      inBuffer[i] = waveform[offset + i] * hann[i];
-    }
-
-    fft.doFFT(inBuffer.data(), outBuffer);
-
-    // Calculate magnitudes in dB (only positive frequencies)
-    for (size_t i = 0; i < numBins; i++) {
-      const auto magnitude = std::abs(outBuffer[i]) * magnitudeScale;
-      const auto magnitude_db =
-          dbConversionFactor * log10f(magnitude + epsilon);
-      magnitudes.push_back(magnitude_db);
-    }
-  }
-
-  return magnitudes;
-}
-
 } // namespace rnexecutorch::dsp
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp
index a1194de69..ee53c7d5a 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp
@@ -30,7 +30,7 @@ BaseModel::BaseModel(const std::string &modelSource,
 }
 
 std::vector BaseModel::getInputShape(std::string method_name,
-                                     int32_t index) {
+                                     int32_t index) const {
   if (!module_) {
     throw std::runtime_error("Model not loaded: Cannot get input shape");
   }
@@ -56,7 +56,7 @@ std::vector BaseModel::getInputShape(std::string method_name,
 }
 
 std::vector>
-BaseModel::getAllInputShapes(std::string methodName) {
+BaseModel::getAllInputShapes(std::string methodName) const {
   if (!module_) {
     throw std::runtime_error("Model not loaded: Cannot get all input shapes");
   }
@@ -88,7 +88,7 @@ BaseModel::getAllInputShapes(std::string methodName) {
 /// to JS. It is not meant to be used within C++. If you want to call forward
 /// from C++ on a BaseModel, please use BaseModel::forward.
 std::vector
-BaseModel::forwardJS(std::vector tensorViewVec) {
+BaseModel::forwardJS(std::vector tensorViewVec) const {
   if (!module_) {
     throw std::runtime_error("Model not loaded: Cannot perform forward pass");
   }
@@ -136,7 +136,7 @@ BaseModel::forwardJS(std::vector tensorViewVec) {
 }
 
 Result
-BaseModel::getMethodMeta(const std::string &methodName) {
+BaseModel::getMethodMeta(const std::string &methodName) const {
   if (!module_) {
     throw std::runtime_error("Model not loaded: Cannot get method meta!");
   }
@@ -161,7 +161,7 @@ BaseModel::forward(const std::vector &input_evalues) const {
 
 Result>
 BaseModel::execute(const std::string &methodName,
-                   const std::vector &input_value) {
+                   const std::vector &input_value) const {
   if (!module_) {
     throw std::runtime_error("Model not loaded, cannot run execute.");
   }
@@ -175,7 +175,7 @@ std::size_t BaseModel::getMemoryLowerBound() const noexcept {
 void BaseModel::unload() noexcept { module_.reset(nullptr); }
 
 std::vector
-BaseModel::getTensorShape(const executorch::aten::Tensor &tensor) {
+BaseModel::getTensorShape(const executorch::aten::Tensor &tensor) const {
   auto sizes = tensor.sizes();
   return std::vector(sizes.begin(), sizes.end());
 }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h
index b7b7b54ed..cf2940429 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h
@@ -25,18 +25,20 @@ class BaseModel {
       Module::LoadMode loadMode = Module::LoadMode::MmapUseMlockIgnoreErrors);
   std::size_t getMemoryLowerBound() const noexcept;
   void unload() noexcept;
-  std::vector getInputShape(std::string method_name, int32_t index);
+  std::vector getInputShape(std::string method_name,
+                            int32_t index) const;
   std::vector>
-  getAllInputShapes(std::string methodName = "forward");
+  getAllInputShapes(std::string methodName = "forward") const;
   std::vector
-  forwardJS(std::vector tensorViewVec);
+  forwardJS(std::vector tensorViewVec) const;
   Result> forward(const EValue &input_value) const;
   Result>
   forward(const std::vector &input_value) const;
-  Result> execute(const std::string &methodName,
-                  const std::vector &input_value);
+  Result>
+  execute(const std::string &methodName,
+          const std::vector &input_value) const;
   Result
-  getMethodMeta(const std::string &methodName);
+  getMethodMeta(const std::string &methodName) const;
 
 protected:
   // If possible, models should not use the JS runtime to keep JSI internals
@@ -49,7 +51,8 @@ class BaseModel {
   std::size_t memorySizeLowerBound{0};
 
 private:
-  std::vector getTensorShape(const executorch::aten::Tensor &tensor);
+  std::vector
+  getTensorShape(const executorch::aten::Tensor &tensor) const;
 };
 
 } // namespace models
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp
index d0f965cb3..bf8f9fb86 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp
@@ -4,7 +4,6 @@
 #include "ASR.h"
 #include "executorch/extension/tensor/tensor_ptr.h"
 #include "rnexecutorch/data_processing/Numerical.h"
-#include "rnexecutorch/data_processing/dsp.h"
 #include "rnexecutorch/data_processing/gzip.h"
 
 namespace rnexecutorch::models::speech_to_text::asr {
@@ -37,8 +36,7 @@ ASR::getInitialSequence(const DecodingOptions &options) const {
   return seq;
 }
 
-GenerationResult ASR::generate(std::span waveform,
-                               float temperature,
+GenerationResult ASR::generate(std::span waveform, float temperature,
                                const DecodingOptions &options) const {
   std::vector encoderOutput = this->encode(waveform);
 
@@ -94,7 +92,7 @@ float ASR::getCompressionRatio(const std::string &text) const {
 }
 
 std::vector
-ASR::generateWithFallback(std::span waveform,
+ASR::generateWithFallback(std::span waveform,
                           const DecodingOptions &options) const {
   std::vector temperatures = {0.0f, 0.2f, 0.4f, 0.6f, 0.8f, 1.0f};
   std::vector bestTokens;
@@ -209,7 +207,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens,
   return wordObjs;
 }
 
-std::vector ASR::transcribe(std::span waveform,
+std::vector ASR::transcribe(std::span waveform,
                             const DecodingOptions &options) const {
   int32_t seek = 0;
   std::vector results;
@@ -218,7 +216,7 @@ std::vector ASR::transcribe(std::span waveform,
     int32_t start = seek * ASR::kSamplingRate;
     const auto end = std::min(
         (seek + ASR::kChunkSize) * ASR::kSamplingRate, waveform.size());
-    std::span chunk = waveform.subspan(start, end - start);
+    auto chunk = waveform.subspan(start, end - start);
 
     if (std::cmp_less(chunk.size(), ASR::kMinChunkSamples)) {
       break;
@@ -246,19 +244,12 @@ std::vector ASR::transcribe(std::span waveform,
   return results;
 }
 
-std::vector ASR::encode(std::span waveform) const {
-  constexpr int32_t fftWindowSize = 512;
-  constexpr int32_t stftHopLength = 160;
-  constexpr int32_t innerDim = 256;
-
-  std::vector preprocessedData =
-      dsp::stftFromWaveform(waveform, fftWindowSize, stftHopLength);
-  const auto numFrames =
-      static_cast(preprocessedData.size()) / innerDim;
-  std::vector inputShape = {numFrames, innerDim};
+std::vector ASR::encode(std::span waveform) const {
+  auto inputShape = {static_cast(waveform.size())};
 
   const auto modelInputTensor = executorch::extension::make_tensor_ptr(
-      std::move(inputShape), std::move(preprocessedData));
+      std::move(inputShape), waveform.data(),
+      executorch::runtime::etensor::ScalarType::Float);
 
   const auto encoderResult = this->encoder->forward(modelInputTensor);
   if (!encoderResult.ok()) {
@@ -268,7 +259,7 @@ std::vector ASR::encode(std::span waveform) const {
   }
 
   const auto decoderOutputTensor = encoderResult.get().at(0).toTensor();
-  const int32_t outputNumel = decoderOutputTensor.numel();
+  const auto outputNumel = decoderOutputTensor.numel();
   const float *const dataPtr = decoderOutputTensor.const_data_ptr();
 
   return {dataPtr, dataPtr + outputNumel};
@@ -277,8 +268,10 @@ std::vector ASR::encode(std::span waveform) const {
 std::vector ASR::decode(std::span tokens,
                         std::span encoderOutput) const {
   std::vector tokenShape = {1, static_cast(tokens.size())};
+  auto tokensLong = std::vector(tokens.begin(), tokens.end());
+
   auto tokenTensor = executorch::extension::make_tensor_ptr(
-      std::move(tokenShape), tokens.data(), ScalarType::Int);
+      tokenShape, tokensLong.data(), ScalarType::Long);
 
   const auto encoderOutputSize = static_cast(encoderOutput.size());
   std::vector encShape = {1, ASR::kNumFrames,
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h
index 20180ebe4..a0ea7e181 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h
@@ -14,9 +14,9 @@ class ASR {
       const models::BaseModel *decoder, const TokenizerModule *tokenizer);
   std::vector
-  transcribe(std::span waveform,
+  transcribe(std::span waveform,
              const types::DecodingOptions &options) const;
-  std::vector encode(std::span waveform) const;
+  std::vector encode(std::span waveform) const;
   std::vector decode(std::span tokens,
                      std::span encoderOutput) const;
@@ -44,11 +44,10 @@ class ASR {
   std::vector
   getInitialSequence(const types::DecodingOptions &options) const;
-  types::GenerationResult generate(std::span waveform,
-                                   float temperature,
+  types::GenerationResult generate(std::span waveform, float temperature,
                                    const types::DecodingOptions &options) const;
   std::vector
-  generateWithFallback(std::span waveform,
+  generateWithFallback(std::span waveform,
                        const types::DecodingOptions &options) const;
   std::vector
   calculateWordLevelTimestamps(std::span tokens,
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp
index d07dbfb3c..dbc974706 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp
@@ -6,7 +6,6 @@
 #include
 #include
 #include
-#include
 #include
 
 namespace rnexecutorch::models::voice_activity_detection {
@@ -158,4 +157,4 @@ VoiceActivityDetection::postprocess(const std::vector &scores,
   return speechSegments;
 }
 
-} // namespace rnexecutorch::models::voice_activity_detection
\ No newline at end of file
+} // namespace rnexecutorch::models::voice_activity_detection
diff --git a/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h b/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h
index 8b61080f8..50025eeeb 100644
--- a/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h
+++ b/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -38,6 +39,9 @@ class GlobalThreadPool {
              numThreads, "threads");
       instance = std::make_unique(numThreads.value(), config);
+      // Disable OpenCV's internal threading to prevent it from overriding our
+      // thread pool configuration, which would cause degraded performance
+      cv::setNumThreads(0);
     });
   }
diff --git a/packages/react-native-executorch/src/constants/directories.ts b/packages/react-native-executorch/src/constants/directories.ts
index ac20d04d8..3cc6e68a9 100644
--- a/packages/react-native-executorch/src/constants/directories.ts
+++ b/packages/react-native-executorch/src/constants/directories.ts
@@ -1,3 +1,5 @@
-import { documentDirectory } from 'expo-file-system';
+import { importLegacyExpoFSModules } from '../utils/ResourceFetcher';
+
+const { documentDirectory } = importLegacyExpoFSModules();
 
 export const RNEDirectory = `${documentDirectory}react-native-executorch/`;
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 57381cf15..50e7ef5a8 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -2,8 +2,8 @@ import { Platform } from 'react-native';
 
 const URL_PREFIX =
   'https://huggingface.co/software-mansion/react-native-executorch';
-const VERSION_TAG = 'resolve/v0.5.0';
-const NEXT_VERSION_TAG = 'resolve/v0.6.0';
+const VERSION_TAG = 'resolve/v0.6.0';
+// const NEXT_VERSION_TAG = 'resolve/v0.7.0';
 
 // LLMs
 
@@ -311,6 +311,9 @@ const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/
 const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`;
 const WHISPER_TINY_EN_DECODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_decoder_xnnpack.pte`;
 
+const WHISPER_TINY_EN_ENCODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_encoder_xnnpack.pte`;
+const WHISPER_TINY_EN_DECODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_decoder_xnnpack.pte`;
+
 const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`;
 const WHISPER_BASE_EN_ENCODER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_encoder_xnnpack.pte`;
 const WHISPER_BASE_EN_DECODER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_decoder_xnnpack.pte`;
@@ -338,6 +341,13 @@ export const WHISPER_TINY_EN = {
   tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
 };
 
+export const WHISPER_TINY_EN_QUANTIZED = {
+  isMultilingual: false,
+  encoderSource: WHISPER_TINY_EN_ENCODER_QUANTIZED,
+  decoderSource: WHISPER_TINY_EN_DECODER_QUANTIZED,
+  tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
+};
+
 export const WHISPER_BASE_EN = {
   isMultilingual: false,
   encoderSource: WHISPER_BASE_EN_ENCODER,
@@ -442,7 +452,7 @@ export const BK_SDM_TINY_VPRED_256 = {
 };
 
 // Voice Activity Detection
-const FSMN_VAD_MODEL = `${URL_PREFIX}-fsmn-vad/${NEXT_VERSION_TAG}/xnnpack/fsmn-vad_xnnpack.pte`;
+const FSMN_VAD_MODEL = `${URL_PREFIX}-fsmn-vad/${VERSION_TAG}/xnnpack/fsmn-vad_xnnpack.pte`;
 
 export const FSMN_VAD = {
   modelSource: FSMN_VAD_MODEL,
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index bcc131eba..bbc113a76 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -1,9 +1,11 @@
 import { ResourceSource } from '../types/common';
-import { ResourceFetcher } from '../utils/ResourceFetcher';
+import {
+  importLegacyExpoFSModules,
+  ResourceFetcher,
+} from '../utils/ResourceFetcher';
 import { ETError, getError } from '../Error';
 import { Template } from '@huggingface/jinja';
 import { DEFAULT_CHAT_CONFIG } from '../constants/llmDefaults';
-import { readAsStringAsync } from 'expo-file-system';
 import {
   ChatConfig,
   GenerationConfig,
@@ -14,6 +16,7 @@ import {
 } from '../types/llm';
 import { parseToolCall } from '../utils/llm';
 import { Logger } from '../common/Logger';
+const { readAsStringAsync } = importLegacyExpoFSModules();
 
 export class LLMController {
   private nativeModule: any;
diff --git a/packages/react-native-executorch/src/utils/ResourceFetcher.ts b/packages/react-native-executorch/src/utils/ResourceFetcher.ts
index fa2fd8c09..efc16ab63 100644
--- a/packages/react-native-executorch/src/utils/ResourceFetcher.ts
+++ b/packages/react-native-executorch/src/utils/ResourceFetcher.ts
@@ -27,7 +27,27 @@
  * - Implements linked list behavior via the `.next` attribute
 * - Automatically processes subsequent downloads when `.next` contains a valid resource
  */
-import {
+import type * as FileSystemTypes from 'expo-file-system';
+
+export function importLegacyExpoFSModules() {
+  let FileSystem: typeof FileSystemTypes;
+
+  try {
+    const expoPkg = require('expo/package.json');
+    const sdkVersion = expoPkg.version.split('.')[0];
+
+    if (Number(sdkVersion) > 53) {
+      FileSystem = require('expo-file-system/legacy');
+    } else {
+      FileSystem = require('expo-file-system');
+    }
+  } catch (e) {
+    throw new Error('Expo must be installed to use react-native-executorch');
+  }
+  return FileSystem;
+}
+
+const {
   cacheDirectory,
   copyAsync,
   createDownloadResumable,
@@ -37,7 +57,8 @@ import {
   EncodingType,
   deleteAsync,
   readDirectoryAsync,
-} from 'expo-file-system';
+} = importLegacyExpoFSModules();
+
 import { Asset } from 'expo-asset';
 import { Platform } from 'react-native';
 import { RNEDirectory } from '../constants/directories';
diff --git a/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts b/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts
index 67d6edc9b..d36a9ba5e 100644
--- a/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts
+++ b/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts
@@ -1,16 +1,14 @@
-/**
- * @internal
- */
-
-import {
-  DownloadResumable,
-  getInfoAsync,
-  makeDirectoryAsync,
-} from 'expo-file-system';
+import type * as FileSystemTypes from 'expo-file-system';
 import { RNEDirectory } from '../constants/directories';
 import { ResourceSource } from '../types/common';
 import { Asset } from 'expo-asset';
 import { Logger } from '../common/Logger';
+import { importLegacyExpoFSModules } from './ResourceFetcher';
+
+/**
+ * @internal
+ */
+const { getInfoAsync, makeDirectoryAsync } = importLegacyExpoFSModules();
 
 export const enum HTTP_CODE {
   OK = 200,
@@ -42,7 +40,7 @@ export interface ResourceSourceExtended {
 }
 
 export interface DownloadResource {
-  downloadResumable: DownloadResumable;
+  downloadResumable: FileSystemTypes.DownloadResumable;
   status: DownloadStatus;
   extendedInfo: ResourceSourceExtended;
 }
@@ -75,7 +73,7 @@ export namespace ResourceFetcherUtils {
     let totalLength = 0;
     let previousFilesTotalLength = 0;
     for (const source of sources) {
-      const type = await ResourceFetcherUtils.getType(source);
+      const type = ResourceFetcherUtils.getType(source);
       let length = 0;
       try {
        if (type === SourceType.REMOTE_FILE && typeof source === 'string') {