Skip to content

New Wasm workloads: Transformers.js ML sentiment analysis and Speech-to-Text #148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions JetStreamDriver.js
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,7 @@ class Scripts {
this.add(`
performance.mark ??= function(name) { return { name }};
performance.measure ??= function() {};
performance.timeOrigin ??= performance.now();
`);
}

Expand Down Expand Up @@ -2210,6 +2211,54 @@ let BENCHMARKS = [
worstCaseCount: 2,
tags: ["Default", "Wasm"],
}),
new AsyncBenchmark({
name: "transformersjs-bert-wasm",
files: [
"./polyfills/fast-text-encoding-1.0.3/text.js",
"./transformersjs/benchmark.js",
"./transformersjs/task-bert.js",
],
preload: {
transformersJsModule: "./transformersjs/build/transformers.js",

onnxJsModule: "./transformersjs/build/onnxruntime-web/ort-wasm-simd-threaded.mjs",
onnxWasmBinary: "./transformersjs/build/onnxruntime-web/ort-wasm-simd-threaded.wasm",

modelWeights: "./transformersjs/build/models/Xenova/distilbert-base-uncased-finetuned-sst-2-english/onnx/model_uint8.onnx",
modelConfig: "./transformersjs/build/models/Xenova/distilbert-base-uncased-finetuned-sst-2-english/config.json",
modelTokenizer: "./transformersjs/build/models/Xenova/distilbert-base-uncased-finetuned-sst-2-english/tokenizer.json",
modelTokenizerConfig: "./transformersjs/build/models/Xenova/distilbert-base-uncased-finetuned-sst-2-english/tokenizer_config.json",
},
iterations: 30,
tags: ["Default", "Wasm"],
}),
new AsyncBenchmark({
name: "transformersjs-whisper-wasm",
files: [
"./polyfills/fast-text-encoding-1.0.3/text.js",
"./transformersjs/benchmark.js",
"./transformersjs/task-whisper.js",
],
preload: {
transformersJsModule: "./transformersjs/build/transformers.js",

onnxJsModule: "./transformersjs/build/onnxruntime-web/ort-wasm-simd-threaded.mjs",
onnxWasmBinary: "./transformersjs/build/onnxruntime-web/ort-wasm-simd-threaded.wasm",

modelEncoderWeights: "./transformersjs/build/models/Xenova/whisper-tiny.en/onnx/encoder_model_quantized.onnx",
modelDecoderWeights: "./transformersjs/build/models/Xenova/whisper-tiny.en/onnx/decoder_model_merged_quantized.onnx",
modelConfig: "./transformersjs/build/models/Xenova/whisper-tiny.en/config.json",
modelTokenizer: "./transformersjs/build/models/Xenova/whisper-tiny.en/tokenizer.json",
modelTokenizerConfig: "./transformersjs/build/models/Xenova/whisper-tiny.en/tokenizer_config.json",
modelPreprocessorConfig: "./transformersjs/build/models/Xenova/whisper-tiny.en/preprocessor_config.json",
modelGenerationConfig: "./transformersjs/build/models/Xenova/whisper-tiny.en/generation_config.json",

inputFile: "./transformersjs/build/inputs/jfk.raw",
},
iterations: 5,
worstCaseCount: 1,
tags: ["Default", "Wasm"],
}),
new WasmLegacyBenchmark({
name: "tfjs-wasm",
files: [
Expand Down
286 changes: 286 additions & 0 deletions polyfills/fast-text-encoding-1.0.3/text.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
/*
* Copyright 2017 Sam Thorogood. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

/**
* @fileoverview Polyfill for TextEncoder and TextDecoder.
*
* You probably want `text.min.js`, and not this file directly.
*/

// JSC / shell adaptation: obtain the global object via indirect eval, which
// evaluates `this` in the global scope even under strict mode. The IIFE
// argument at the bottom of this file falls back to this `global` binding
// when `window` is undefined (i.e. in JS shells rather than browsers).
const global = (0,eval)("this");

(function(scope) {
'use strict';

// fail early: if the host already provides native TextEncoder/TextDecoder,
// keep them and skip installing the polyfill entirely.
if (scope['TextEncoder'] && scope['TextDecoder']) {
return false;
}

// Encoding labels accepted by FastTextDecoder; anything else is rejected
// with a RangeError (this polyfill supports UTF-8 only).
const validUtfLabels = ['utf-8', 'utf8', 'unicode-1-1-utf-8'];

/**
 * UTF-8 encoder polyfill matching the WHATWG TextEncoder interface.
 *
 * @constructor
 */
function FastTextEncoder() {
  // Per spec, TextEncoder takes no encoding label and always emits UTF-8:
  // https://www.w3.org/TR/encoding/#dom-textencoder
}

Object.defineProperty(FastTextEncoder.prototype, 'encoding', {value: 'utf-8'});

/**
 * Encodes a string as UTF-8 bytes.
 *
 * Unpaired surrogates are replaced with U+FFFD (REPLACEMENT CHARACTER),
 * matching the native TextEncoder / the Encoding Standard. (The previous
 * version dropped lone high surrogates and emitted invalid CESU-8 for lone
 * low surrogates.)
 *
 * @param {string} string
 * @param {{stream: boolean}=} options only {stream: false} is supported
 * @return {!Uint8Array}
 * @throws {Error} if options.stream is true
 */
FastTextEncoder.prototype['encode'] = function(string, options={stream: false}) {
  if (options.stream) {
    throw new Error(`Failed to encode: the 'stream' option is unsupported.`);
  }

  let pos = 0;
  const len = string.length;

  let at = 0;  // output position
  let tlen = Math.max(32, len + (len >>> 1) + 7);  // 1.5x size
  let target = new Uint8Array((tlen >>> 3) << 3);  // ... but at 8 byte offset

  while (pos < len) {
    let value = string.charCodeAt(pos++);
    if (value >= 0xd800 && value <= 0xdfff) {
      // Surrogate code unit: try to combine a high surrogate with a
      // following low surrogate into a single supplementary code point.
      if (value <= 0xdbff && pos < len) {
        const extra = string.charCodeAt(pos);
        if ((extra & 0xfc00) === 0xdc00) {
          ++pos;
          value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
        }
      }
      if (value >= 0xd800 && value <= 0xdfff) {
        // Still a surrogate: it is unpaired (lone high or lone low). The
        // Encoding Standard requires emitting U+FFFD here rather than
        // dropping the unit or encoding an invalid sequence.
        value = 0xfffd;
      }
    }

    // expand the buffer if we couldn't write 4 bytes
    if (at + 4 > target.length) {
      tlen += 8;  // minimum extra
      tlen *= (1.0 + (pos / string.length) * 2);  // take 2x the remaining
      tlen = (tlen >>> 3) << 3;  // 8 byte offset

      const update = new Uint8Array(tlen);
      update.set(target);
      target = update;
    }

    if ((value & 0xffffff80) === 0) {  // 1-byte
      target[at++] = value;  // ASCII
      continue;
    } else if ((value & 0xfffff800) === 0) {  // 2-byte
      target[at++] = ((value >>> 6) & 0x1f) | 0xc0;
    } else if ((value & 0xffff0000) === 0) {  // 3-byte
      target[at++] = ((value >>> 12) & 0x0f) | 0xe0;
      target[at++] = ((value >>> 6) & 0x3f) | 0x80;
    } else if ((value & 0xffe00000) === 0) {  // 4-byte
      target[at++] = ((value >>> 18) & 0x07) | 0xf0;
      target[at++] = ((value >>> 12) & 0x3f) | 0x80;
      target[at++] = ((value >>> 6) & 0x3f) | 0x80;
    } else {
      continue;  // out of range (unreachable for values built from charCodeAt)
    }

    target[at++] = (value & 0x3f) | 0x80;  // final continuation byte
  }

  // Use subarray if slice isn't supported (IE11). This will use more memory
  // because the original array still exists.
  return target.slice ? target.slice(0, at) : target.subarray(0, at);
}

/**
 * Minimal UTF-8-only TextDecoder polyfill.
 *
 * @constructor
 * @param {string=} utfLabel encoding label; must be a UTF-8 alias
 * @param {{fatal: boolean}=} options the 'fatal' flag is accepted but ignored
 * @throws {RangeError} if utfLabel is not a recognized UTF-8 label
 */
function FastTextDecoder(utfLabel='utf-8', options={fatal: false}) {
  const label = utfLabel.toLowerCase();
  if (!validUtfLabels.includes(label)) {
    throw new RangeError(
        `Failed to construct 'TextDecoder': The encoding label provided ('${utfLabel}') is invalid.`);
  }
  // NOTE(review): a spec-compliant decoder would honor options.fatal; the
  // upstream throw for it is intentionally disabled in this vendored copy,
  // so fatal:true is silently accepted.
}

// Only UTF-8 is supported and the behavioral flags are fixed constants.
Object.defineProperty(FastTextDecoder.prototype, 'encoding', {value: 'utf-8'});

Object.defineProperty(FastTextDecoder.prototype, 'fatal', {value: false});

Object.defineProperty(FastTextDecoder.prototype, 'ignoreBOM', {value: false});

/**
 * Decodes UTF-8 bytes via Node's Buffer (fast path when Buffer exists).
 * Respects the view's byteOffset/byteLength, so subarrays decode correctly.
 *
 * @param {!Uint8Array} bytes
 * @return {string}
 */
function decodeBuffer(bytes) {
  const view = Buffer.from(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  return view.toString('utf-8');
}

/**
 * Decodes UTF-8 bytes with a synchronous XMLHttpRequest against a Blob URL
 * (legacy-browser path).
 *
 * @param {!Uint8Array} bytes
 * @return {string}
 */
function decodeSyncXHR(bytes) {
  const blob = new Blob([bytes], {type: 'text/plain;charset=UTF-8'});
  const url = URL.createObjectURL(blob);

  try {
    const request = new XMLHttpRequest();
    request.open('GET', url, false);
    request.send();
    return request.responseText;
  } catch (e) {
    // Sync XHRs can be disabled by the host (e.g. non-Edgium Edge), so fall
    // back to the pure-JS decoder rather than failing.
    return decodeFallback(bytes);
  } finally {
    URL.revokeObjectURL(url);
  }
}

/**
 * Pure-JS UTF-8 decoder, used when neither Buffer nor a synchronous XHR is
 * available. Decodes in bounded chunks to keep String.fromCharCode.apply
 * argument counts small.
 *
 * NOTE(review): unlike the native TextDecoder, invalid sequences are parsed
 * blindly rather than producing U+FFFD, and a multi-byte sequence truncated
 * at the end of input reads `undefined` continuation bytes (which coerce to
 * 0 under `& 0x3f`). Acceptable for well-formed input only.
 *
 * @param {!Uint8Array} bytes
 * @return {string}
 */
function decodeFallback(bytes) {
let inputIndex = 0;

// Create a working buffer for UTF-16 code points, but don't generate one
// which is too large for small input sizes. UTF-8 to UCS-16 conversion is
// going to be at most 1:1, if all code points are ASCII. The other extreme
// is 4-byte UTF-8, which results in two UCS-16 points, but this is still 50%
// fewer entries in the output.
const pendingSize = Math.min(256 * 256, bytes.length + 1);
const pending = new Uint16Array(pendingSize);
const chunks = [];
let pendingIndex = 0;

for (;;) {
const more = inputIndex < bytes.length;

// If there's no more data or there'd be no room for two UTF-16 values,
// create a chunk. This isn't done at the end by simply slicing the data
// into equal sized chunks as we might hit a surrogate pair.
if (!more || (pendingIndex >= pendingSize - 1)) {
// nb. .apply and friends are *really slow*. Low-hanging fruit is to
// expand this to literally pass pending[0], pending[1], ... etc, but
// the output code expands pretty fast in this case.
chunks.push(String.fromCharCode.apply(null, pending.subarray(0, pendingIndex)));

if (!more) {
return chunks.join('');
}

// Move the buffer forward and create another chunk. Rebasing `bytes`
// lets inputIndex restart from zero without extra bookkeeping.
bytes = bytes.subarray(inputIndex);
inputIndex = 0;
pendingIndex = 0;
}

// The native TextDecoder will generate "REPLACEMENT CHARACTER" where the
// input data is invalid. Here, we blindly parse the data even if it's
// wrong: e.g., if a 3-byte sequence doesn't have two valid continuations.

const byte1 = bytes[inputIndex++];
if ((byte1 & 0x80) === 0) { // 1-byte or null
pending[pendingIndex++] = byte1;
} else if ((byte1 & 0xe0) === 0xc0) { // 2-byte
const byte2 = bytes[inputIndex++] & 0x3f;
pending[pendingIndex++] = ((byte1 & 0x1f) << 6) | byte2;
} else if ((byte1 & 0xf0) === 0xe0) { // 3-byte
const byte2 = bytes[inputIndex++] & 0x3f;
const byte3 = bytes[inputIndex++] & 0x3f;
// nb. 0x1f is equivalent to the canonical 0x0f mask here: this branch only
// matches lead bytes 0xe0-0xef, whose bit 4 is always clear.
pending[pendingIndex++] = ((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3;
} else if ((byte1 & 0xf8) === 0xf0) { // 4-byte
const byte2 = bytes[inputIndex++] & 0x3f;
const byte3 = bytes[inputIndex++] & 0x3f;
const byte4 = bytes[inputIndex++] & 0x3f;

// this can be > 0xffff, so possibly generate surrogates
let codepoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
if (codepoint > 0xffff) {
// codepoint &= ~0x10000;
codepoint -= 0x10000;
pending[pendingIndex++] = (codepoint >>> 10) & 0x3ff | 0xd800;
codepoint = 0xdc00 | codepoint & 0x3ff;
}
pending[pendingIndex++] = codepoint;
} else {
// invalid initial byte (0x80-0xbf continuation or 0xf8+): silently skipped
}
}
}

// Pick the fastest decode strategy available on this host. Pure-JS decoding
// is the slowest, so prefer a platform-provided path when one exists.
let decodeImpl;
if (typeof Buffer === 'function' && Buffer.from) {
  // Node: Buffer.from was added in Node v5.10.0 (2015-11-17).
  decodeImpl = decodeBuffer;
} else if (typeof Blob === 'function' && typeof URL === 'function' &&
           typeof URL.createObjectURL === 'function') {
  // Browsers: Blob and URL.createObjectURL are available from IE10, Safari 6,
  // Chrome 19 (all released in 2012), Firefox 19 (2013), ...
  decodeImpl = decodeSyncXHR;
} else {
  // Last resort: decode entirely in JS.
  decodeImpl = decodeFallback;
}

/**
 * Decodes UTF-8 data into a string.
 *
 * @param {(!ArrayBuffer|!ArrayBufferView)} buffer
 * @param {{stream: boolean}=} options only {stream: false} is supported
 * @return {string|undefined} undefined when no buffer is passed (the native
 *     decoder would return "" for decode() with no arguments; preserved as-is)
 * @throws {Error} if options.stream is true
 */
FastTextDecoder.prototype['decode'] = function(buffer, options={stream: false}) {
  if (options['stream']) {
    throw new Error(`Failed to decode: the 'stream' option is unsupported.`);
  }

  if (!buffer)
    return;

  let bytes;

  if (buffer instanceof Uint8Array) {
    // Accept Uint8Array instances as-is (decodeBuffer already honors their
    // byteOffset/byteLength).
    bytes = buffer;
  } else if (buffer.buffer instanceof ArrayBuffer) {
    // Any other ArrayBufferView (typed arrays, DataView): wrap only the
    // region the view actually covers. BUG FIX: the previous code used
    // `new Uint8Array(buffer.buffer)`, which decodes the ENTIRE backing
    // buffer and silently ignores a view's byteOffset/byteLength.
    bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
  } else {
    // The only other valid argument here is that "buffer" is an ArrayBuffer.
    // We also try to convert anything else passed to a Uint8Array, as this
    // catches anything that's array-like. Native code would throw here.
    bytes = new Uint8Array(buffer);
  }

  return decodeImpl(/** @type {!Uint8Array} */ (bytes));
}

// Install the polyfill on the detected global object: `window` in browsers,
// `global` in Node / JS shells (see the eval-based shim near the top of this
// file), or `this` as a last resort.
scope['TextEncoder'] = FastTextEncoder;
scope['TextDecoder'] = FastTextDecoder;

}(typeof window !== 'undefined' ? window : (typeof global !== 'undefined' ? global : this)));
4 changes: 2 additions & 2 deletions sqlite3/benchmark.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class TextEncoder {
return Uint8Array.from(string, (char) => {
let byte = char.codePointAt(0);
if (byte > 0x7f)
throw new Error("TextEncoder polyfill only supports ASCII");
throw new Error("TextEncoder polyfill only supports ASCII, got: " + char);
return byte;
});
}
Expand All @@ -27,7 +27,7 @@ class TextDecoder {
decode(array) {
for (let byte of array) {
if (byte > 0x7f)
throw new Error("TextDecoder polyfill only supports ASCII");
throw new Error("TextDecoder polyfill only supports ASCII, got: " + byte);
}
return String.fromCharCode.apply(null, array);
}
Expand Down
2 changes: 2 additions & 0 deletions transformersjs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/util/node_modules/
/util/package-lock.json
12 changes: 12 additions & 0 deletions transformersjs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
- Two tasks: one text/NLP, one audio processing/speech-to-text.
- Everything in `build/` is generated or an upstream library.
- Everything in `util/` is tooling for building and preparing the benchmark.

# Licenses

- Transformers.js: Apache 2.0, https://github.com/huggingface/transformers.js/blob/main/LICENSE
- ONNX runtime: MIT, https://github.com/microsoft/onnxruntime/blob/main/LICENSE
- `text-encoding` Polyfill: Unlicense OR Apache 2.0, https://github.com/inexorabletash/text-encoding/blob/master/LICENSE.md
- `fast-text-encoding` Polyfill (vendored under `polyfills/fast-text-encoding-1.0.3/`): Apache 2.0, https://github.com/samthor/fast-text-encoding/blob/master/LICENSE
- Model `DistilBERT base uncased finetuned SST-2`: Apache 2.0, https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english
- Model `openai/whisper-tiny.en`: Apache 2.0, https://huggingface.co/openai/whisper-tiny.en
- Audio file for speech-to-text task: Public domain, https://www.jfklibrary.org/learn/about-jfk/historic-speeches/inaugural-address
Loading
Loading