diff --git a/.github/workflows/blake3-wasm-publish.yml b/.github/workflows/blake3-wasm-publish.yml new file mode 100644 index 0000000000..30018ffda8 --- /dev/null +++ b/.github/workflows/blake3-wasm-publish.yml @@ -0,0 +1,64 @@ +name: Blake3 WASM - Version and Release + +on: + workflow_dispatch: + inputs: + newversion: + type: choice + description: "Semantic Version Bump Type" + default: patch + options: + - patch + - minor + - major + +concurrency: + group: "push-to-main" + +defaults: + run: + working-directory: packages/blake3-wasm + +jobs: + version_and_release: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + # Needed to push the tag and the commit on the main branch, otherwise we get: + # > Run git push --follow-tags + # remote: error: GH006: Protected branch update failed for refs/heads/main. + # remote: error: Changes must be made through a pull request. Required status check "lint" is expected. + token: ${{ secrets.BOT_ACCESS_TOKEN }} + - run: npm install -g corepack@latest && corepack enable + - uses: actions/setup-node@v3 + with: + node-version: "20" + cache: "pnpm" + cache-dependency-path: | + packages/blake3-wasm/pnpm-lock.yaml + # setting a registry enables the NODE_AUTH_TOKEN env variable where we can set an npm token. REQUIRED + registry-url: "https://registry.npmjs.org" + - run: pnpm install + - run: git config --global user.name machineuser + - run: git config --global user.email infra+machineuser@huggingface.co + - run: | + PACKAGE_VERSION=$(node -p "require('./package.json').version") + BUMPED_VERSION=$(node -p "require('semver').inc('$PACKAGE_VERSION', '${{ github.event.inputs.newversion }}')") + # Update package.json with the new version + node -e "const fs = require('fs'); const package = JSON.parse(fs.readFileSync('./package.json')); package.version = '$BUMPED_VERSION'; fs.writeFileSync('./package.json', JSON.stringify(package, null, '\t') + '\n');" + git commit . 
-m "🔖 @huggingface/blake3-wasm $BUMPED_VERSION" + git tag "blake3-wasm-v$BUMPED_VERSION" + - run: pnpm publish --no-git-checks . + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + - run: (git pull --rebase && git push --follow-tags) || (git pull --rebase && git push --follow-tags) + # hack - reuse actions/setup-node@v3 just to set a new registry + - uses: actions/setup-node@v3 + with: + node-version: "20" + registry-url: "https://npm.pkg.github.com" + # Disable for now, until github supports PATs for writing github packages (https://github.com/github/roadmap/issues/558) + # - run: pnpm publish --no-git-checks . + # env: + # NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/gearhash-wasm-publish.yml b/.github/workflows/gearhash-wasm-publish.yml new file mode 100644 index 0000000000..4070718785 --- /dev/null +++ b/.github/workflows/gearhash-wasm-publish.yml @@ -0,0 +1,64 @@ +name: Gearhash WASM - Version and Release + +on: + workflow_dispatch: + inputs: + newversion: + type: choice + description: "Semantic Version Bump Type" + default: patch + options: + - patch + - minor + - major + +concurrency: + group: "push-to-main" + +defaults: + run: + working-directory: packages/gearhash-wasm + +jobs: + version_and_release: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + # Needed to push the tag and the commit on the main branch, otherwise we get: + # > Run git push --follow-tags + # remote: error: GH006: Protected branch update failed for refs/heads/main. + # remote: error: Changes must be made through a pull request. Required status check "lint" is expected. + token: ${{ secrets.BOT_ACCESS_TOKEN }} + - run: npm install -g corepack@latest && corepack enable + - uses: actions/setup-node@v3 + with: + node-version: "20" + cache: "pnpm" + cache-dependency-path: | + packages/gearhash-wasm/pnpm-lock.yaml + # setting a registry enables the NODE_AUTH_TOKEN env variable where we can set an npm token. 
REQUIRED + registry-url: "https://registry.npmjs.org" + - run: pnpm install + - run: git config --global user.name machineuser + - run: git config --global user.email infra+machineuser@huggingface.co + - run: | + PACKAGE_VERSION=$(node -p "require('./package.json').version") + BUMPED_VERSION=$(node -p "require('semver').inc('$PACKAGE_VERSION', '${{ github.event.inputs.newversion }}')") + # Update package.json with the new version + node -e "const fs = require('fs'); const package = JSON.parse(fs.readFileSync('./package.json')); package.version = '$BUMPED_VERSION'; fs.writeFileSync('./package.json', JSON.stringify(package, null, '\t') + '\n');" + git commit . -m "🔖 @huggingface/gearhash-wasm $BUMPED_VERSION" + git tag "gearhash-wasm-v$BUMPED_VERSION" + - run: pnpm publish --no-git-checks . + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + - run: (git pull --rebase && git push --follow-tags) || (git pull --rebase && git push --follow-tags) + # hack - reuse actions/setup-node@v3 just to set a new registry + - uses: actions/setup-node@v3 + with: + node-version: "20" + registry-url: "https://npm.pkg.github.com" + # Disable for now, until github supports PATs for writing github packages (https://github.com/github/roadmap/issues/558) + # - run: pnpm publish --no-git-checks . 
+ # env: + # NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/splitmix64-wasm-publish.yml b/.github/workflows/splitmix64-wasm-publish.yml new file mode 100644 index 0000000000..f35f7c0a5b --- /dev/null +++ b/.github/workflows/splitmix64-wasm-publish.yml @@ -0,0 +1,64 @@ +name: Splitmix64 WASM - Version and Release + +on: + workflow_dispatch: + inputs: + newversion: + type: choice + description: "Semantic Version Bump Type" + default: patch + options: + - patch + - minor + - major + +concurrency: + group: "push-to-main" + +defaults: + run: + working-directory: packages/splitmix64-wasm + +jobs: + version_and_release: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + # Needed to push the tag and the commit on the main branch, otherwise we get: + # > Run git push --follow-tags + # remote: error: GH006: Protected branch update failed for refs/heads/main. + # remote: error: Changes must be made through a pull request. Required status check "lint" is expected. + token: ${{ secrets.BOT_ACCESS_TOKEN }} + - run: npm install -g corepack@latest && corepack enable + - uses: actions/setup-node@v3 + with: + node-version: "20" + cache: "pnpm" + cache-dependency-path: | + packages/splitmix64-wasm/pnpm-lock.yaml + # setting a registry enables the NODE_AUTH_TOKEN env variable where we can set an npm token. 
REQUIRED + registry-url: "https://registry.npmjs.org" + - run: pnpm install + - run: git config --global user.name machineuser + - run: git config --global user.email infra+machineuser@huggingface.co + - run: | + PACKAGE_VERSION=$(node -p "require('./package.json').version") + BUMPED_VERSION=$(node -p "require('semver').inc('$PACKAGE_VERSION', '${{ github.event.inputs.newversion }}')") + # Update package.json with the new version + node -e "const fs = require('fs'); const package = JSON.parse(fs.readFileSync('./package.json')); package.version = '$BUMPED_VERSION'; fs.writeFileSync('./package.json', JSON.stringify(package, null, '\t') + '\n');" + git commit . -m "🔖 @huggingface/splitmix64-wasm $BUMPED_VERSION" + git tag "splitmix64-wasm-v$BUMPED_VERSION" + - run: pnpm publish --no-git-checks . + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + - run: (git pull --rebase && git push --follow-tags) || (git pull --rebase && git push --follow-tags) + # hack - reuse actions/setup-node@v3 just to set a new registry + - uses: actions/setup-node@v3 + with: + node-version: "20" + registry-url: "https://npm.pkg.github.com" + # Disable for now, until github supports PATs for writing github packages (https://github.com/github/roadmap/issues/558) + # - run: pnpm publish --no-git-checks . 
+ # env: + # NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/xetchunk-wasm-publish.yml b/.github/workflows/xetchunk-wasm-publish.yml new file mode 100644 index 0000000000..d56da18b9c --- /dev/null +++ b/.github/workflows/xetchunk-wasm-publish.yml @@ -0,0 +1,64 @@ +name: Xetchunk WASM - Version and Release + +on: + workflow_dispatch: + inputs: + newversion: + type: choice + description: "Semantic Version Bump Type" + default: patch + options: + - patch + - minor + - major + +concurrency: + group: "push-to-main" + +defaults: + run: + working-directory: packages/xetchunk-wasm + +jobs: + version_and_release: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + # Needed to push the tag and the commit on the main branch, otherwise we get: + # > Run git push --follow-tags + # remote: error: GH006: Protected branch update failed for refs/heads/main. + # remote: error: Changes must be made through a pull request. Required status check "lint" is expected. + token: ${{ secrets.BOT_ACCESS_TOKEN }} + - run: npm install -g corepack@latest && corepack enable + - uses: actions/setup-node@v3 + with: + node-version: "20" + cache: "pnpm" + cache-dependency-path: | + packages/xetchunk-wasm/pnpm-lock.yaml + # setting a registry enables the NODE_AUTH_TOKEN env variable where we can set an npm token. 
REQUIRED + registry-url: "https://registry.npmjs.org" + - run: pnpm install + - run: git config --global user.name machineuser + - run: git config --global user.email infra+machineuser@huggingface.co + - run: | + PACKAGE_VERSION=$(node -p "require('./package.json').version") + BUMPED_VERSION=$(node -p "require('semver').inc('$PACKAGE_VERSION', '${{ github.event.inputs.newversion }}')") + # Update package.json with the new version + node -e "const fs = require('fs'); const package = JSON.parse(fs.readFileSync('./package.json')); package.version = '$BUMPED_VERSION'; fs.writeFileSync('./package.json', JSON.stringify(package, null, '\t') + '\n');" + git commit . -m "🔖 @huggingface/xetchunk-wasm $BUMPED_VERSION" + git tag "xetchunk-wasm-v$BUMPED_VERSION" + - run: pnpm publish --no-git-checks . + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + - run: (git pull --rebase && git push --follow-tags) || (git pull --rebase && git push --follow-tags) + # hack - reuse actions/setup-node@v3 just to set a new registry + - uses: actions/setup-node@v3 + with: + node-version: "20" + registry-url: "https://npm.pkg.github.com" + # Disable for now, until github supports PATs for writing github packages (https://github.com/github/roadmap/issues/558) + # - run: pnpm publish --no-git-checks . + # env: + # NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/packages/blake3-wasm/LICENSE_A2 b/packages/blake3-wasm/LICENSE_A2 new file mode 100644 index 0000000000..2cdf43fa3e --- /dev/null +++ b/packages/blake3-wasm/LICENSE_A2 @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/packages/blake3-wasm/LICENSE_A2LLVM b/packages/blake3-wasm/LICENSE_A2LLVM new file mode 100644 index 0000000000..2cdf43fa3e --- /dev/null +++ b/packages/blake3-wasm/LICENSE_A2LLVM @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/packages/blake3-wasm/LICENSE_CC0 b/packages/blake3-wasm/LICENSE_CC0 new file mode 100644 index 0000000000..1625c17936 --- /dev/null +++ b/packages/blake3-wasm/LICENSE_CC0 @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. 
+ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. 
publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. 
Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. 
Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. \ No newline at end of file diff --git a/packages/blake3-wasm/README.md b/packages/blake3-wasm/README.md new file mode 100644 index 0000000000..b971608b36 --- /dev/null +++ b/packages/blake3-wasm/README.md @@ -0,0 +1,33 @@ +JS and WASM implementations of https://github.com/BLAKE3-team/BLAKE3 + +Using [AssemblyScript](https://www.assemblyscript.org/) to generate a lean WASM. + +## Usage + +```javascript +import { blake3, blake3Hex, blake3Keyed, createHasher, update, finalize } from '@huggingface/blake3-wasm'; + +// Create a Uint8Array of data to hash +const data = new Uint8Array(1_000_000); // Example: 1MB of data +// ... fill data with your content ... 
+ +const hashUint8 = blake3(data); +const hashHex = blake3Hex(data); + +// Or streaming fashion +const hasher = createHasher(); + +for (const chunk of dataSource) { + update(hasher, chunk); +} + +const hash = finalize(hasher); + +// When passing custom key +const hashKeyed = blake3Keyed(data, new Uint8Array([ + 0,1,2,3,4,5,6,7,8, + 9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23, + 24,25,26,27,28,29,30,31 +])); +``` \ No newline at end of file diff --git a/packages/blake3-wasm/asconfig.json b/packages/blake3-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/blake3-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/blake3-wasm/assembly/blake3.ts b/packages/blake3-wasm/assembly/blake3.ts new file mode 100644 index 0000000000..5091750dc7 --- /dev/null +++ b/packages/blake3-wasm/assembly/blake3.ts @@ -0,0 +1,425 @@ +// Constants from the reference implementation +const OUT_LEN: i32 = 32; +// const KEY_LEN: usize = 32; +const BLOCK_LEN: i32 = 64; +const CHUNK_LEN: i32 = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +const KEYED_HASH: u32 = 1 << 4; +//const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +// const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: StaticArray = [ + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +]; + +const MSG_PERMUTATION: StaticArray = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. 
+function g(state: StaticArray, a: i32, b: i32, c: i32, d: i32, mx: u32, my: u32): void { + state[a] = state[a] + state[b] + mx; + state[d] = rotr(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + my; + state[d] = rotr(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr(state[b] ^ state[c], 7); +} + +function round(state: StaticArray, m: StaticArray): void { + // Mix the columns. + g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. + g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +function permute(m: StaticArray): void { + const permuted = new StaticArray(16); + for (let i = 0; i < 16; i++) { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + for (let i = 0; i < 16; i++) { + m[i] = permuted[i]; + } +} + +function compress( + chaining_value: StaticArray, + block_words: StaticArray, + counter: u64, + block_len: u32, + flags: u32 +): StaticArray { + const counter_low = counter as u32; + const counter_high = (counter >> 32) as u32; + const state = new StaticArray(16); + + // Initialize state + for (let i = 0; i < 8; i++) { + state[i] = chaining_value[i]; + state[i + 8] = IV[i]; + } + state[12] = counter_low; + state[13] = counter_high; + state[14] = block_len; + state[15] = flags; + + const block = new StaticArray(16); + for (let i = 0; i < 16; i++) { + block[i] = block_words[i]; + } + + // Apply rounds + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + + // Final mixing + for (let i = 0; i < 8; i++) { + state[i] ^= 
state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + + return state; +} + +function words_from_little_endian_bytes(bytes: StaticArray, words: StaticArray): void { + for (let i = 0; i < words.length; i++) { + const offset = i * 4; + words[i] = + bytes[offset] | + ((bytes[offset + 1] as u32) << 8) | + ((bytes[offset + 2] as u32) << 16) | + ((bytes[offset + 3] as u32) << 24); + } +} + +class Blake3Hasher { + private chunk_state: ChunkState; + private key_words: StaticArray; + private cv_stack: StaticArray>; + private cv_stack_len: u8; + private flags: u32; + + constructor(key_words: StaticArray = [IV[0], IV[1], IV[2], IV[3], IV[4], IV[5], IV[6], IV[7]], flags: u32 = 0) { + this.key_words = key_words; + this.chunk_state = new ChunkState(key_words, 0, flags); + this.cv_stack = new StaticArray>(54); + this.cv_stack_len = 0; + this.flags = flags; + + for (let i = 0; i < 54; i++) { + this.cv_stack[i] = new StaticArray(8); + } + } + + // Constructor for keyed hash + static newKeyed(key: Uint8Array): Blake3Hasher { + if (key.length != 32) { + throw new Error("Key must be exactly 32 bytes"); + } + + const key_words = new StaticArray(8); + // const key_static = new StaticArray(32); + // for (let i = 0; i < 32; i++) { + // key_static[i] = key[i]; + // } + // words_from_little_endian_bytes(key_static, key_words); + const dataView = new DataView(key.buffer); + for (let i = 0; i < 8; i++) { + key_words[i] = dataView.getUint32(i * 4, true); + } + + return new Blake3Hasher(key_words, KEYED_HASH); + } + + update(input: Uint8Array): void { + let inputPos = 0; + while (inputPos < input.length) { + if (this.chunk_state.len() == CHUNK_LEN) { + const chunk_cv = this.chunk_state.output().chaining_value(); + const total_chunks = this.chunk_state.chunk_counter + 1; + this.add_chunk_chaining_value(chunk_cv, total_chunks); + this.chunk_state = new ChunkState(this.key_words, total_chunks, this.flags); + } + + const want = CHUNK_LEN - this.chunk_state.len(); + const take = min(want, 
input.length - inputPos); + this.chunk_state.update(input.subarray(inputPos, inputPos + take)); + inputPos += take; + } + } + + finalize(out: Uint8Array): void { + let output = this.chunk_state.output(); + let parent_nodes_remaining = this.cv_stack_len; + + while (parent_nodes_remaining > 0) { + parent_nodes_remaining--; + output = parent_output( + this.cv_stack[parent_nodes_remaining], + output.chaining_value(), + this.key_words, + this.flags + ); + } + + output.root_output_bytes(out); + } + + private add_chunk_chaining_value(new_cv: StaticArray, total_chunks: u64): void { + let mut_new_cv = new_cv; + let mut_total_chunks = total_chunks; + + while ((mut_total_chunks & 1) == 0) { + mut_new_cv = parent_cv(this.pop_stack(), mut_new_cv, this.key_words, this.flags); + mut_total_chunks >>= 1; + } + + this.push_stack(mut_new_cv); + } + + private push_stack(cv: StaticArray): void { + for (let i = 0; i < 8; i++) { + this.cv_stack[this.cv_stack_len][i] = cv[i]; + } + this.cv_stack_len++; + } + + private pop_stack(): StaticArray { + this.cv_stack_len--; + return this.cv_stack[this.cv_stack_len]; + } +} + +class ChunkState { + chaining_value: StaticArray; + chunk_counter: u64; + block: StaticArray; + block_len: u8; + blocks_compressed: u8; + flags: u32; + + constructor(key_words: StaticArray, chunk_counter: u64, flags: u32) { + this.chaining_value = new StaticArray(8); + this.chunk_counter = chunk_counter; + this.block = new StaticArray(BLOCK_LEN); + this.block_len = 0; + this.blocks_compressed = 0; + this.flags = flags; + + for (let i = 0; i < 8; i++) { + this.chaining_value[i] = key_words[i]; + } + } + + len(): i32 { + return BLOCK_LEN * this.blocks_compressed + this.block_len; + } + + start_flag(): u32 { + return this.blocks_compressed == 0 ? 
CHUNK_START : 0; + } + + update(input: Uint8Array): void { + let inputPos = 0; + while (inputPos < input.length) { + if (this.block_len == BLOCK_LEN) { + const block_words = new StaticArray(16); + words_from_little_endian_bytes(this.block, block_words); + const compressed = compress( + this.chaining_value, + block_words, + this.chunk_counter, + BLOCK_LEN, + this.flags | this.start_flag() + ); + for (let i = 0; i < 8; i++) { + this.chaining_value[i] = compressed[i]; + } + this.blocks_compressed++; + this.block = new StaticArray(BLOCK_LEN); + this.block_len = 0; + } + + const want = BLOCK_LEN - this.block_len; + const take = min(want, input.length - inputPos); + for (let i = 0; i < take; i++) { + this.block[this.block_len + i] = input[inputPos + i]; + } + this.block_len += take as u8; + inputPos += take; + } + } + + output(): Output { + const block_words = new StaticArray(16); + words_from_little_endian_bytes(this.block, block_words); + return new Output( + this.chaining_value, + block_words, + this.chunk_counter, + this.block_len, + this.flags | this.start_flag() | CHUNK_END + ); + } +} + +class Output { + input_chaining_value: StaticArray; + block_words: StaticArray; + counter: u64; + block_len: u32; + flags: u32; + + constructor( + input_chaining_value: StaticArray, + block_words: StaticArray, + counter: u64, + block_len: u32, + flags: u32 + ) { + this.input_chaining_value = input_chaining_value; + this.block_words = block_words; + this.counter = counter; + this.block_len = block_len; + this.flags = flags; + } + + chaining_value(): StaticArray { + const compressed = compress(this.input_chaining_value, this.block_words, this.counter, this.block_len, this.flags); + const result = new StaticArray(8); + for (let i = 0; i < 8; i++) { + result[i] = compressed[i]; + } + return result; + } + + root_output_bytes(out: Uint8Array): void { + let output_block_counter: u64 = 0; + for (let i = 0; i < out.length; i += 2 * OUT_LEN) { + const words = compress( + 
this.input_chaining_value, + this.block_words, + output_block_counter, + this.block_len, + this.flags | ROOT + ); + const out_block = out.subarray(i, i + 2 * OUT_LEN); + for (let j = 0; j < words.length; j++) { + const word = words[j]; + const offset = j * 4; + if (offset < out_block.length) { + out_block[offset] = word & 0xff; + if (offset + 1 < out_block.length) { + out_block[offset + 1] = (word >> 8) & 0xff; + if (offset + 2 < out_block.length) { + out_block[offset + 2] = (word >> 16) & 0xff; + if (offset + 3 < out_block.length) { + out_block[offset + 3] = (word >> 24) & 0xff; + } + } + } + } + } + output_block_counter++; + } + } +} + +function parent_output( + left_child_cv: StaticArray, + right_child_cv: StaticArray, + key_words: StaticArray, + flags: u32 +): Output { + const block_words = new StaticArray(16); + for (let i = 0; i < 8; i++) { + block_words[i] = left_child_cv[i]; + block_words[i + 8] = right_child_cv[i]; + } + return new Output(key_words, block_words, 0, BLOCK_LEN, PARENT | flags); +} + +function parent_cv( + left_child_cv: StaticArray, + right_child_cv: StaticArray, + key_words: StaticArray, + flags: u32 +): StaticArray { + return parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value(); +} + +export function blake3(input: Uint8Array): Uint8Array { + const hasher = new Blake3Hasher(); + hasher.update(input); + const output = new Uint8Array(32); + hasher.finalize(output); + return output; +} + +export function blake3Hex(input: Uint8Array): string { + const hash = blake3(input); + const hex = new Array(64); + for (let i = 0; i < 32; i++) { + hex[i * 2] = (hash[i] >> 4).toString(16); + hex[i * 2 + 1] = (hash[i] & 0x0f).toString(16); + } + return hex.join(""); +} + +export function blake3Keyed(input: Uint8Array, key: Uint8Array): Uint8Array { + const hasher = Blake3Hasher.newKeyed(key); + hasher.update(input); + const output = new Uint8Array(32); + hasher.finalize(output); + return output; +} + +export function 
blake3KeyedHex(input: Uint8Array, key: Uint8Array): string { + const hash = blake3Keyed(input, key); + const hex = new Array(64); + for (let i = 0; i < 32; i++) { + hex[i * 2] = (hash[i] >> 4).toString(16); + hex[i * 2 + 1] = (hash[i] & 0x0f).toString(16); + } + return hex.join(""); +} + +export function createHasher(): Blake3Hasher { + return new Blake3Hasher(); +} + +export function createKeyedHasher(key: Uint8Array): Blake3Hasher { + return Blake3Hasher.newKeyed(key); +} + +export function update(hasher: Blake3Hasher, input: Uint8Array): void { + hasher.update(input); +} + +export function finalize(hasher: Blake3Hasher): Uint8Array { + const output = new Uint8Array(32); + hasher.finalize(output); + return output; +} diff --git a/packages/blake3-wasm/assembly/index.ts b/packages/blake3-wasm/assembly/index.ts new file mode 100644 index 0000000000..8183303929 --- /dev/null +++ b/packages/blake3-wasm/assembly/index.ts @@ -0,0 +1,2 @@ +// Re-export everything from blake3.ts +export * from "./blake3"; diff --git a/packages/blake3-wasm/assembly/tsconfig.json b/packages/blake3-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..8131d68a0a --- /dev/null +++ b/packages/blake3-wasm/assembly/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.36/node_modules/assemblyscript/std/assembly.json", + "include": ["./**/*.ts"] +} diff --git a/packages/blake3-wasm/build/.gitignore b/packages/blake3-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/blake3-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/blake3-wasm/package.json b/packages/blake3-wasm/package.json new file mode 100644 index 0000000000..2fd61e316f --- /dev/null +++ b/packages/blake3-wasm/package.json @@ -0,0 +1,48 @@ +{ + "name": "@huggingface/blake3-wasm", + "version": "0.0.3", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts 
--target release", + "build": "pnpm run build:debug && pnpm run build:release", + "test": "vitest run", + "prepare": "pnpm run build" + }, + "keywords": [ + "blake3", + "assemblyscript", + "assembly", + "wasm" + ], + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "main": "./build/release.js", + "types": "./build/release.d.ts", + "files": [ + "build/release.js", + "build/release.d.ts", + "build/release.wasm", + "build/release.wat", + "build/release.wasm.map", + "LICENSE_A2", + "LICENSE_A2LLVM", + "LICENSE_CC0", + "README.md", + "asconfig.json", + "assembly" + ], + "devDependencies": { + "assemblyscript": "0.27.36" + } +} diff --git a/packages/blake3-wasm/pnpm-lock.yaml b/packages/blake3-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..9d7ac0a92a --- /dev/null +++ b/packages/blake3-wasm/pnpm-lock.yaml @@ -0,0 +1,38 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + assemblyscript: + specifier: 0.27.36 + version: 0.27.36 + +packages: + + assemblyscript@0.27.36: + resolution: {integrity: sha512-1qX2zf6p7l/mNYv8r21jC/Yft7kX7XKR3xUHw41zvV4xad5lyC8w7jZiwZBGoy64VKZLc+bTDJDWi8Kb70YrHA==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.36: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/blake3-wasm/tests/index.test.ts 
b/packages/blake3-wasm/tests/index.test.ts new file mode 100644 index 0000000000..6f4550d893 --- /dev/null +++ b/packages/blake3-wasm/tests/index.test.ts @@ -0,0 +1,208 @@ +// Adapted from https://github.com/mcmilk/BLAKE3-tests/blob/11a8abeceac93b5eba664eae3679efb4ffa5bc0a/blake3_test.c + +import { describe, expect } from "vitest"; +import { blake3Hex, blake3KeyedHex } from "../build/debug.js"; +import { it } from "vitest"; + +const buffer = new Uint8Array(102400); +let i = 0; +let j = 0; + +for (i = 0, j = 0; i < buffer.length; i++, j++) { + if (j === 251) { + j = 0; + } + buffer[i] = j; +} + +function uint8ArrayFromString(str: string) { + const arr = new Uint8Array(str.length); + for (let i = 0; i < str.length; i++) { + arr[i] = str.charCodeAt(i); + } + return arr; +} + +const key = uint8ArrayFromString("whats the Elvish word for friend"); + +const testCases = [ + { + buf: buffer.slice(0, 0), + expected: "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262", + keyed: "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26", + }, + { + buf: buffer.slice(0, 1), + expected: "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213", + keyed: "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b", + }, + { + buf: buffer.slice(0, 2), + expected: "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63", + keyed: "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a", + }, + { + buf: buffer.slice(0, 3), + expected: "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f", + keyed: "39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9", + }, + { + buf: buffer.slice(0, 4), + expected: "f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f32", + keyed: "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe070", + }, + { + buf: buffer.slice(0, 5), + expected: "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2", + keyed: 
"73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c8461", + }, + { + buf: buffer.slice(0, 6), + expected: "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c844", + keyed: "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6d", + }, + { + buf: buffer.slice(0, 7), + expected: "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe", + keyed: "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a", + }, + { + buf: buffer.slice(0, 8), + expected: "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb", + keyed: "be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a0600", + }, + { + buf: buffer.slice(0, 63), + expected: "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b", + keyed: "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37ae", + }, + { + buf: buffer.slice(0, 64), + expected: "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98", + keyed: "ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e6", + }, + { + buf: buffer.slice(0, 65), + expected: "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee", + keyed: "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72", + }, + { + buf: buffer.slice(0, 127), + expected: "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640d", + keyed: "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818", + }, + { + buf: buffer.slice(0, 128), + expected: "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45ef", + keyed: "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ec", + }, + { + buf: buffer.slice(0, 129), + expected: "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12", + keyed: "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f", + }, + { + buf: buffer.slice(0, 1023), + expected: "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11", + keyed: 
"c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e", + }, + { + buf: buffer.slice(0, 1024), + expected: "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af7", + keyed: "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4", + }, + { + buf: buffer.slice(0, 1025), + expected: "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444", + keyed: "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69", + }, + { + buf: buffer.slice(0, 2048), + expected: "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a", + keyed: "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd1", + }, + { + buf: buffer.slice(0, 2049), + expected: "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b6879522563030", + keyed: "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5", + }, + { + buf: buffer.slice(0, 3072), + expected: "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd2", + keyed: "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df191770", + }, + { + buf: buffer.slice(0, 3073), + expected: "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd3", + keyed: "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a", + }, + { + buf: buffer.slice(0, 4096), + expected: "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e969", + keyed: "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0", + }, + { + buf: buffer.slice(0, 4097), + expected: "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb995", + keyed: "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc", + }, + { + buf: buffer.slice(0, 5120), + expected: "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833", + keyed: "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e20", + }, + { + buf: buffer.slice(0, 5121), + expected: "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff", + keyed: 
"6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d024", + }, + { + buf: buffer.slice(0, 6144), + expected: "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca205", + keyed: "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1d", + }, + { + buf: buffer.slice(0, 6145), + expected: "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f", + keyed: "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539", + }, + { + buf: buffer.slice(0, 7168), + expected: "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a", + keyed: "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fc", + }, + { + buf: buffer.slice(0, 7169), + expected: "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e7817", + keyed: "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa511", + }, + { + buf: buffer.slice(0, 8192), + expected: "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a63", + keyed: "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a", + }, + { + buf: buffer.slice(0, 8193), + expected: "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3b", + keyed: "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5", + }, + { + buf: buffer.slice(0, 102400), + expected: "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085", + keyed: "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7", + }, +]; + +describe("blake3", () => { + describe("BLAKE3_TESTS", () => { + for (const testCase of testCases) { + it(`should pass ${testCase.buf.length} bytes`, () => { + const result = blake3Hex(testCase.buf); + expect(result).toBe(testCase.expected); + + const resultKeyed = blake3KeyedHex(testCase.buf, key); + expect(resultKeyed).toBe(testCase.keyed); + }); + } + }); +}); diff --git a/packages/blake3-wasm/vendor/Cargo.lock b/packages/blake3-wasm/vendor/Cargo.lock new file mode 100644 index 0000000000..9f0162bf75 --- /dev/null +++ 
b/packages/blake3-wasm/vendor/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "blake3-example" +version = "0.1.0" diff --git a/packages/blake3-wasm/vendor/Cargo.toml b/packages/blake3-wasm/vendor/Cargo.toml new file mode 100644 index 0000000000..7f31968ed3 --- /dev/null +++ b/packages/blake3-wasm/vendor/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "blake3-example" +version = "0.1.0" +edition = "2021" + +[lib] +name = "reference_impl" +path = "src/lib.rs" + +[[bin]] +name = "blake3-example" +path = "src/main.rs" \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/README.md b/packages/blake3-wasm/vendor/README.md new file mode 100644 index 0000000000..46cce0d076 --- /dev/null +++ b/packages/blake3-wasm/vendor/README.md @@ -0,0 +1,27 @@ +# BLAKE3 Example + +This is a simple example that demonstrates using the BLAKE3 hash function with empty input. + +## Prerequisites + +- Rust and Cargo installed on your system. You can install them from [rustup.rs](https://rustup.rs/) + +## Running the Example + +1. Open a terminal in this directory +2. Run the following command: + ```bash + cargo run + ``` + +The program will output a 32-byte hash in hexadecimal format. For empty input, the expected output should be: +``` +af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 +``` + +## What the Code Does + +1. Creates a new BLAKE3 hasher +2. Updates it with empty input +3. Finalizes the hash into a 32-byte buffer +4. Prints the hash in hexadecimal format \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/src/blake3.rs b/packages/blake3-wasm/vendor/src/blake3.rs new file mode 100644 index 0000000000..bc701784f8 --- /dev/null +++ b/packages/blake3-wasm/vendor/src/blake3.rs @@ -0,0 +1,376 @@ +// From https://github.com/BLAKE3-team/BLAKE3/blob/master/reference_impl/reference_impl.rs + +//! 
This is the reference implementation of BLAKE3. It is used for testing and +//! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 +//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) +//! discusses this implementation. You can render docs for this implementation +//! by running `cargo doc --open` in this directory. +//! +//! # Example +//! +//! ``` +//! let mut hasher = reference_impl::Hasher::new(); +//! hasher.update(b"abc"); +//! hasher.update(b"def"); +//! let mut hash = [0; 32]; +//! hasher.finalize(&mut hash); +//! let mut extended_hash = [0; 500]; +//! hasher.finalize(&mut extended_hash); +//! assert_eq!(hash, extended_hash[..32]); +//! ``` + +use core::cmp::min; + +const OUT_LEN: usize = 32; +const KEY_LEN: usize = 32; +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +const KEYED_HASH: u32 = 1 << 4; +const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: [u32; 8] = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. +fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(my); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +fn round(state: &mut [u32; 16], m: &[u32; 16]) { + // Mix the columns. 
+ g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. + g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +fn permute(m: &mut [u32; 16]) { + let mut permuted = [0; 16]; + for i in 0..16 { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + *m = permuted; +} + +fn compress( + chaining_value: &[u32; 8], + block_words: &[u32; 16], + counter: u64, + block_len: u32, + flags: u32, +) -> [u32; 16] { + let counter_low = counter as u32; + let counter_high = (counter >> 32) as u32; + #[rustfmt::skip] + let mut state = [ + chaining_value[0], chaining_value[1], chaining_value[2], chaining_value[3], + chaining_value[4], chaining_value[5], chaining_value[6], chaining_value[7], + IV[0], IV[1], IV[2], IV[3], + counter_low, counter_high, block_len, flags, + ]; + let mut block = *block_words; + + round(&mut state, &block); // round 1 + permute(&mut block); + round(&mut state, &block); // round 2 + permute(&mut block); + round(&mut state, &block); // round 3 + permute(&mut block); + round(&mut state, &block); // round 4 + permute(&mut block); + round(&mut state, &block); // round 5 + permute(&mut block); + round(&mut state, &block); // round 6 + permute(&mut block); + round(&mut state, &block); // round 7 + + for i in 0..8 { + state[i] ^= state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + state +} + +fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] { + compression_output[0..8].try_into().unwrap() +} + +fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) { + debug_assert_eq!(bytes.len(), 4 * words.len()); + for (four_bytes, word) in bytes.chunks_exact(4).zip(words) { + *word = u32::from_le_bytes(four_bytes.try_into().unwrap()); + } +} + +// Each chunk or parent node can produce either an 8-word chaining value or, by +// setting the 
ROOT flag, any number of final output bytes. The Output struct +// captures the state just prior to choosing between those two possibilities. +struct Output { + input_chaining_value: [u32; 8], + block_words: [u32; 16], + counter: u64, + block_len: u32, + flags: u32, +} + +impl Output { + fn chaining_value(&self) -> [u32; 8] { + first_8_words(compress( + &self.input_chaining_value, + &self.block_words, + self.counter, + self.block_len, + self.flags, + )) + } + + fn root_output_bytes(&self, out_slice: &mut [u8]) { + let mut output_block_counter = 0; + for out_block in out_slice.chunks_mut(2 * OUT_LEN) { + let words = compress( + &self.input_chaining_value, + &self.block_words, + output_block_counter, + self.block_len, + self.flags | ROOT, + ); + // The output length might not be a multiple of 4. + for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) { + out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]); + } + output_block_counter += 1; + } + } +} + +struct ChunkState { + chaining_value: [u32; 8], + chunk_counter: u64, + block: [u8; BLOCK_LEN], + block_len: u8, + blocks_compressed: u8, + flags: u32, +} + +impl ChunkState { + fn new(key_words: [u32; 8], chunk_counter: u64, flags: u32) -> Self { + Self { + chaining_value: key_words, + chunk_counter, + block: [0; BLOCK_LEN], + block_len: 0, + blocks_compressed: 0, + flags, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize + } + + fn start_flag(&self) -> u32 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the block buffer is full, compress it and clear it. More + // input is coming, so this compression is not CHUNK_END. 
+ if self.block_len as usize == BLOCK_LEN { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + self.chaining_value = first_8_words(compress( + &self.chaining_value, + &block_words, + self.chunk_counter, + BLOCK_LEN as u32, + self.flags | self.start_flag(), + )); + self.blocks_compressed += 1; + self.block = [0; BLOCK_LEN]; + self.block_len = 0; + } + + // Copy input bytes into the block buffer. + let want = BLOCK_LEN - self.block_len as usize; + let take = min(want, input.len()); + self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]); + self.block_len += take as u8; + input = &input[take..]; + } + } + + fn output(&self) -> Output { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + Output { + input_chaining_value: self.chaining_value, + block_words, + counter: self.chunk_counter, + block_len: self.block_len as u32, + flags: self.flags | self.start_flag() | CHUNK_END, + } + } +} + +fn parent_output( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> Output { + let mut block_words = [0; 16]; + block_words[..8].copy_from_slice(&left_child_cv); + block_words[8..].copy_from_slice(&right_child_cv); + Output { + input_chaining_value: key_words, + block_words, + counter: 0, // Always 0 for parent nodes. + block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes. + flags: PARENT | flags, + } +} + +fn parent_cv( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> [u32; 8] { + parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value() +} + +/// An incremental hasher that can accept any number of writes. 
+pub struct Hasher { + chunk_state: ChunkState, + key_words: [u32; 8], + cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values: + cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64 + flags: u32, +} + +impl Hasher { + fn new_internal(key_words: [u32; 8], flags: u32) -> Self { + Self { + chunk_state: ChunkState::new(key_words, 0, flags), + key_words, + cv_stack: [[0; 8]; 54], + cv_stack_len: 0, + flags, + } + } + + /// Construct a new `Hasher` for the regular hash function. + pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. + pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let mut key_words = [0; 8]; + words_from_little_endian_bytes(key, &mut key_words); + Self::new_internal(key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. The context + /// string should be hardcoded, globally unique, and application-specific. + pub fn new_derive_key(context: &str) -> Self { + let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); + context_hasher.update(context.as_bytes()); + let mut context_key = [0; KEY_LEN]; + context_hasher.finalize(&mut context_key); + let mut context_key_words = [0; 8]; + words_from_little_endian_bytes(&context_key, &mut context_key_words); + Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) + } + + fn push_stack(&mut self, cv: [u32; 8]) { + self.cv_stack[self.cv_stack_len as usize] = cv; + self.cv_stack_len += 1; + } + + fn pop_stack(&mut self) -> [u32; 8] { + self.cv_stack_len -= 1; + self.cv_stack[self.cv_stack_len as usize] + } + + // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. + fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { + // This chunk might complete some subtrees. For each completed subtree, + // its left child will be the current top entry in the CV stack, and + // its right child will be the current value of `new_cv`. 
Pop each left + // child off the stack, merge it with `new_cv`, and overwrite `new_cv` + // with the result. After all these merges, push the final value of + // `new_cv` onto the stack. The number of completed subtrees is given + // by the number of trailing 0-bits in the new total number of chunks. + while total_chunks & 1 == 0 { + new_cv = parent_cv(self.pop_stack(), new_cv, self.key_words, self.flags); + total_chunks >>= 1; + } + self.push_stack(new_cv); + } + + /// Add input to the hash state. This can be called any number of times. + pub fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the current chunk is complete, finalize it and reset the + // chunk state. More input is coming, so this chunk is not ROOT. + if self.chunk_state.len() == CHUNK_LEN { + let chunk_cv = self.chunk_state.output().chaining_value(); + let total_chunks = self.chunk_state.chunk_counter + 1; + self.add_chunk_chaining_value(chunk_cv, total_chunks); + self.chunk_state = ChunkState::new(self.key_words, total_chunks, self.flags); + } + + // Compress input bytes into the current chunk state. + let want = CHUNK_LEN - self.chunk_state.len(); + let take = min(want, input.len()); + self.chunk_state.update(&input[..take]); + input = &input[take..]; + } + } + + /// Finalize the hash and write any number of output bytes. + pub fn finalize(&self, out_slice: &mut [u8]) { + // Starting with the Output from the current chunk, compute all the + // parent chaining values along the right edge of the tree, until we + // have the root Output. 
+ let mut output = self.chunk_state.output(); + let mut parent_nodes_remaining = self.cv_stack_len as usize; + while parent_nodes_remaining > 0 { + parent_nodes_remaining -= 1; + output = parent_output( + self.cv_stack[parent_nodes_remaining], + output.chaining_value(), + self.key_words, + self.flags, + ); + } + output.root_output_bytes(out_slice); + } +} diff --git a/packages/blake3-wasm/vendor/src/lib.rs b/packages/blake3-wasm/vendor/src/lib.rs new file mode 100644 index 0000000000..874b108ebf --- /dev/null +++ b/packages/blake3-wasm/vendor/src/lib.rs @@ -0,0 +1,3 @@ +mod blake3; + +pub use blake3::*; \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/src/main.rs b/packages/blake3-wasm/vendor/src/main.rs new file mode 100644 index 0000000000..76a1537cbd --- /dev/null +++ b/packages/blake3-wasm/vendor/src/main.rs @@ -0,0 +1,30 @@ +use std::io::Write; + +fn main() { + println!("Starting BLAKE3 hash computation for empty input"); + + // Create a new hasher + let mut hasher = reference_impl::Hasher::new(); + println!("Created new hasher"); + + // Update with empty input + let input = &[0u8, 1u8]; + println!("Input length: {} bytes", input.len()); + hasher.update(input); + println!("Updated hasher with input"); + + // Create a buffer for the output + let mut output = [0u8; 32]; + + // Get the hash + hasher.finalize(&mut output); + println!("Finalized hash computation"); + + // Print the hash in hex format + let mut stdout = std::io::stdout(); + print!("Final hash: "); + for byte in output { + write!(stdout, "{:02x}", byte).unwrap(); + } + println!(); +} \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/target/.gitignore b/packages/blake3-wasm/vendor/target/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/blake3-wasm/vendor/target/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/blake3-wasm/vitest.config.ts b/packages/blake3-wasm/vitest.config.ts new file mode 100644 index 
0000000000..2fb5c48d93 --- /dev/null +++ b/packages/blake3-wasm/vitest.config.ts @@ -0,0 +1,8 @@ +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + globals: true, + environment: "node", + }, +}); diff --git a/packages/gearhash-wasm/README.md b/packages/gearhash-wasm/README.md new file mode 100644 index 0000000000..2c0de81843 --- /dev/null +++ b/packages/gearhash-wasm/README.md @@ -0,0 +1,85 @@ +JS and WASM implementations of https://github.com/srijs/rust-gearhash + +Using [AssemblyScript](https://www.assemblyscript.org/) to generate a lean WASM. + +## Usage + +```javascript +import { nextMatch } from '@huggingface/gearhash-wasm'; + +// Create a Uint8Array of data to search through +const data = new Uint8Array(1000000); // Example: 1MB of data +// ... fill data with your content ... + +const mask = 0x0000d90003530000n; // Example mask as a BigInt, more 1s in binary repr => bigger chunks +//^ it has 11 1s in binary, so chunks will be ~2048 long +const match = nextMatch(data, mask); +const allMatches = nextMatches(data, mask).matches; +``` + +The `nextMatch` function takes two parameters: +- `data`: A Uint8Array containing the data to search through +- `mask`: A BigInt, the more 1s it has in its binary representation, the bigger the chunk + +The function returns an object with the `position` (i32) and `hash` (u64) properties + +You can continuously feed data like this: + +```javascript +let hash = 0n; +const mask = 0x0000d90003530000n; + +let length = 0; // extra length not processed +for await (const chunk of dataSource) { + let index = 0; + while (1) { + let match = nextMatch(chunk.subArray(index), mask, hash); + + if (match.position !== -1) { + console.log({ + length: match.position + length, + hash: match.hash + }) + + index += match.position; + length = 0; + hash = 0n; + } else { + length += chunk.length - index; + break; + } + } +} + +console.log(length, "bytes without a match, ending hash: ", hash); +``` + +or, more 
performant with `nextMatches`: + +```javascript +let hash = 0n; +const mask = 0x0000d90003530000n; + +let length = 0; +for await (const chunk of dataSource) { + const result = nextMatches(chunk, mask, hash); + let lastPosition = 0; + for (const match of result.matches) { + console.log({ + length: match.position - lastPosition + length, + hash: match.hash + }); + + length = 0; + lastPosition = match.position; + } + length = result.remaining; + hash = result.hash; +} + +console.log(length, "bytes without a match, ending hash: ", hash); +``` + +## Possible improvements + +SIMD \ No newline at end of file diff --git a/packages/gearhash-wasm/asconfig.json b/packages/gearhash-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/gearhash-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/gearhash-wasm/assembly/index.ts b/packages/gearhash-wasm/assembly/index.ts new file mode 100644 index 0000000000..447e7776f7 --- /dev/null +++ b/packages/gearhash-wasm/assembly/index.ts @@ -0,0 +1,2 @@ +export { DEFAULT_TABLE } from "./table"; +export { nextMatch, nextMatches } from "./next-match"; diff --git a/packages/gearhash-wasm/assembly/next-match.ts b/packages/gearhash-wasm/assembly/next-match.ts new file mode 100644 index 0000000000..1093f77a80 --- /dev/null +++ b/packages/gearhash-wasm/assembly/next-match.ts @@ -0,0 +1,46 @@ +// The entry file of your WebAssembly module. 
+ +import { DEFAULT_TABLE } from "./table"; + +// Interface for the match result +export class MatchResult { + position: i32 = -1; + hash: u64 = 0; +} + +// Function to find the next match in the buffer +export function nextMatch(buf: Uint8Array, mask: u64, hash: u64 = 0): MatchResult { + for (let i = 0; i < buf.length; i++) { + const b = buf[i]; + hash = (hash << 1) + DEFAULT_TABLE[b]; + + if ((hash & mask) == 0) { + return { position: i + 1, hash }; + } + } + + return { position: -1, hash }; // Return -1 position to indicate no match found, along with the final hash +} + +export class NextMatchesResult { + matches: MatchResult[] = []; + hash: u64 = 0; + remaining: i32 = 0; +} + +export function nextMatches(buf: Uint8Array, mask: u64, hash: u64 = 0): NextMatchesResult { + const result = new NextMatchesResult(); + + let match = nextMatch(buf, mask, hash); + let position = 0; + while (match.position !== -1) { + result.matches.push(match); + position += match.position; + match = nextMatch(buf.subarray(position), mask, 0); + } + + result.remaining = buf.length - position; + result.hash = match.hash; + + return result; +} diff --git a/packages/gearhash-wasm/assembly/table.ts b/packages/gearhash-wasm/assembly/table.ts new file mode 100644 index 0000000000..22a9e52df9 --- /dev/null +++ b/packages/gearhash-wasm/assembly/table.ts @@ -0,0 +1,57 @@ +/* eslint-disable @typescript-eslint/no-loss-of-precision */ + +// Define the Table type as a static array of u64 values +export const DEFAULT_TABLE: StaticArray = [ + 0xb088d3a9e840f559, 0x5652c7f739ed20d6, 0x45b28969898972ab, 0x6b0a89d5b68ec777, 0x368f573e8b7a31b7, + 0x1dc636dce936d94b, 0x207a4c4e5554d5b6, 0xa474b34628239acb, 0x3b06a83e1ca3b912, 0x90e78d6c2f02baf7, + 0xe1c92df7150d9a8a, 0x8e95053a1086d3ad, 0x5a2ef4f1b83a0722, 0xa50fac949f807fae, 0x0e7303eb80d8d681, + 0x99b07edc1570ad0f, 0x689d2fb555fd3076, 0x00005082119ea468, 0xc4b08306a88fcc28, 0x3eb0678af6374afd, + 0xf19f87ab86ad7436, 0xf2129fbfbe6bc736, 0x481149575c98a4ed, 
0x0000010695477bc5, 0x1fba37801a9ceacc, + 0x3bf06fd663a49b6d, 0x99687e9782e3874b, 0x79a10673aa50d8e3, 0xe4accf9e6211f420, 0x2520e71f87579071, + 0x2bd5d3fd781a8a9b, 0x00de4dcddd11c873, 0xeaa9311c5a87392f, 0xdb748eb617bc40ff, 0xaf579a8df620bf6f, + 0x86a6e5da1b09c2b1, 0xcc2fc30ac322a12e, 0x355e2afec1f74267, 0x2d99c8f4c021a47b, 0xbade4b4a9404cfc3, + 0xf7b518721d707d69, 0x3286b6587bf32c20, 0x0000b68886af270c, 0xa115d6e4db8a9079, 0x484f7e9c97b2e199, + 0xccca7bb75713e301, 0xbf2584a62bb0f160, 0xade7e813625dbcc8, 0x000070940d87955a, 0x8ae69108139e626f, + 0xbd776ad72fde38a2, 0xfb6b001fc2fcc0cf, 0xc7a474b8e67bc427, 0xbaf6f11610eb5d58, 0x09cb1f5b6de770d1, + 0xb0b219e6977d4c47, 0x00ccbc386ea7ad4a, 0xcc849d0adf973f01, 0x73a3ef7d016af770, 0xc807d2d386bdbdfe, + 0x7f2ac9966c791730, 0xd037a86bc6c504da, 0xf3f17c661eaa609d, 0xaca626b04daae687, 0x755a99374f4a5b07, + 0x90837ee65b2caede, 0x6ee8ad93fd560785, 0x0000d9e11053edd8, 0x9e063bb2d21cdbd7, 0x07ab77f12a01d2b2, + 0xec550255e6641b44, 0x78fb94a8449c14c6, 0xc7510e1bc6c0f5f5, 0x0000320b36e4cae3, 0x827c33262c8b1a2d, + 0x14675f0b48ea4144, 0x267bd3a6498deceb, 0xf1916ff982f5035e, 0x86221b7ff434fb88, 0x9dbecee7386f49d8, + 0xea58f8cac80f8f4a, 0x008d198692fc64d8, 0x6d38704fbabf9a36, 0xe032cb07d1e7be4c, 0x228d21f6ad450890, + 0x635cb1bfc02589a5, 0x4620a1739ca2ce71, 0xa7e7dfe3aae5fb58, 0x0c10ca932b3c0deb, 0x2727fee884afed7b, + 0xa2df1c6df9e2ab1f, 0x4dcdd1ac0774f523, 0x000070ffad33e24e, 0xa2ace87bc5977816, 0x9892275ab4286049, + 0xc2861181ddf18959, 0xbb9972a042483e19, 0xef70cd3766513078, 0x00000513abfc9864, 0xc058b61858c94083, + 0x09e850859725e0de, 0x9197fb3bf83e7d94, 0x7e1e626d12b64bce, 0x520c54507f7b57d1, 0xbee1797174e22416, + 0x6fd9ac3222e95587, 0x0023957c9adfbf3e, 0xa01c7d7e234bbe15, 0xaba2c758b8a38cbb, 0x0d1fa0ceec3e2b30, + 0x0bb6a58b7e60b991, 0x4333dd5b9fa26635, 0xc2fd3b7d4001c1a3, 0xfb41802454731127, 0x65a56185a50d18cb, + 0xf67a02bd8784b54f, 0x696f11dd67e65063, 0x00002022fca814ab, 0x8cd6be912db9d852, 0x695189b6e9ae8a57, + 0xee9453b50ada0c28, 
0xd8fc5ea91a78845e, 0xab86bf191a4aa767, 0x0000c6b5c86415e5, 0x267310178e08a22e, + 0xed2d101b078bca25, 0x3b41ed84b226a8fb, 0x13e622120f28dc06, 0xa315f5ebfb706d26, 0x8816c34e3301bace, + 0xe9395b9cbb71fdae, 0x002ce9202e721648, 0x4283db1d2bb3c91c, 0xd77d461ad2b1a6a5, 0xe2ec17e46eeb866b, + 0xb8e0be4039fbc47c, 0xdea160c4d5299d04, 0x7eec86c8d28c3634, 0x2119ad129f98a399, 0xa6ccf46b61a283ef, + 0x2c52cedef658c617, 0x2db4871169acdd83, 0x0000f0d6f39ecbe9, 0x3dd5d8c98d2f9489, 0x8a1872a22b01f584, + 0xf282a4c40e7b3cf2, 0x8020ec2ccb1ba196, 0x6693b6e09e59e313, 0x0000ce19cc7c83eb, 0x20cb5735f6479c3b, + 0x762ebf3759d75a5b, 0x207bfe823d693975, 0xd77dc112339cd9d5, 0x9ba7834284627d03, 0x217dc513e95f51e9, + 0xb27b1a29fc5e7816, 0x00d5cd9831bb662d, 0x71e39b806d75734c, 0x7e572af006fb1a23, 0xa2734f2f6ae91f85, + 0xbf82c6b5022cddf2, 0x5c3beac60761a0de, 0xcdc893bb47416998, 0x6d1085615c187e01, 0x77f8ae30ac277c5d, + 0x917c6b81122a2c91, 0x5b75b699add16967, 0x0000cf6ae79a069b, 0xf3c40afa60de1104, 0x2063127aa59167c3, + 0x621de62269d1894d, 0xd188ac1de62b4726, 0x107036e2154b673c, 0x0000b85f28553a1d, 0xf2ef4e4c18236f3d, + 0xd9d6de6611b9f602, 0xa1fc7955fb47911c, 0xeb85fd032f298dbd, 0xbe27502fb3befae1, 0xe3034251c4cd661e, + 0x441364d354071836, 0x0082b36c75f2983e, 0xb145910316fa66f0, 0x021c069c9847caf7, 0x2910dfc75a4b5221, + 0x735b353e1c57a8b5, 0xce44312ce98ed96c, 0xbc942e4506bdfa65, 0xf05086a71257941b, 0xfec3b215d351cead, + 0x00ae1055e0144202, 0xf54b40846f42e454, 0x00007fd9c8bcbcc8, 0xbfbd9ef317de9bfe, 0xa804302ff2854e12, + 0x39ce4957a5e5d8d4, 0xffb9e2a45637ba84, 0x55b9ad1d9ea0818b, 0x00008acbf319178a, 0x48e2bfc8d0fbfb38, + 0x8be39841e848b5e8, 0x0e2712160696a08b, 0xd51096e84b44242a, 0x1101ba176792e13a, 0xc22e770f4531689d, + 0x1689eff272bbc56c, 0x00a92a197f5650ec, 0xbc765990bda1784e, 0xc61441e392fcb8ae, 0x07e13a2ced31e4a0, + 0x92cbe984234e9d4d, 0x8f4ff572bb7d8ac5, 0x0b9670c00b963bd0, 0x62955a581a03eb01, 0x645f83e5ea000254, + 0x41fce516cd88f299, 0xbbda9748da7a98cf, 0x0000aab2fe4845fa, 0x19761b069bf56555, 
0x8b8f5e8343b6ad56, + 0x3e5d1cfd144821d9, 0xec5c1e2ca2b0cd8f, 0xfaf7e0fea7fbb57f, 0x000000d3ba12961b, 0xda3f90178401b18e, + 0x70ff906de33a5feb, 0x0527d5a7c06970e7, 0x22d8e773607c13e9, 0xc9ab70df643c3bac, 0xeda4c6dc8abe12e3, + 0xecef1f410033e78a, 0x0024c2b274ac72cb, 0x06740d954fa900b4, 0x1d7a299b323d6304, 0xb3c37cb298cbead5, + 0xc986e3c76178739b, 0x9fabea364b46f58a, 0x6da214c5af85cc56, 0x17a43ed8b7a38f84, 0x6eccec511d9adbeb, + 0xf9cab30913335afb, 0x4a5e60c5f415eed2, 0x00006967503672b4, 0x9da51d121454bb87, 0x84321e13b9bbc816, + 0xfb3d6fb6ab2fdd8d, 0x60305eed8e160a8d, 0xcbbf4b14e9946ce8, 0x00004f63381b10c3, 0x07d5b7816fcc4e10, + 0xe5a536726a6a8155, 0x57afb23447a07fdd, 0x18f346f7abc9d394, 0x636dc655d61ad33d, 0xcc8bab4939f7f3f6, + 0x63c7a906c1dd187b, +]; diff --git a/packages/gearhash-wasm/assembly/tsconfig.json b/packages/gearhash-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..f81c3d55e6 --- /dev/null +++ b/packages/gearhash-wasm/assembly/tsconfig.json @@ -0,0 +1,6 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.36/node_modules/assemblyscript/std/assembly.json", + "include": [ + "./**/*.ts" + ] +} \ No newline at end of file diff --git a/packages/gearhash-wasm/build/.gitignore b/packages/gearhash-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/gearhash-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/gearhash-wasm/package.json b/packages/gearhash-wasm/package.json new file mode 100644 index 0000000000..42b67a87c0 --- /dev/null +++ b/packages/gearhash-wasm/package.json @@ -0,0 +1,46 @@ +{ + "name": "@huggingface/gearhash-wasm", + "version": "0.0.1", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "vitest run", + "prepare": "pnpm run build" + }, + "keywords": [ + "gearhash", + "assemblyscript", + 
"assembly", + "wasm" + ], + "type": "module", + "main": "./build/release.js", + "types": "./build/release.d.ts", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "devDependencies": { + "assemblyscript": "0.27.36", + "@huggingface/splitmix64-wasm": "workspace:*" + }, + "files": [ + "build/release.js", + "build/release.d.ts", + "build/release.wasm", + "build/release.wat", + "build/release.wasm.map", + "README.md", + "asconfig.json", + "assembly" + ] +} diff --git a/packages/gearhash-wasm/pnpm-lock.yaml b/packages/gearhash-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..b8bd0747a6 --- /dev/null +++ b/packages/gearhash-wasm/pnpm-lock.yaml @@ -0,0 +1,41 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + '@huggingface/splitmix64-wasm': + specifier: workspace:* + version: link:../splitmix64-wasm + assemblyscript: + specifier: 0.27.36 + version: 0.27.36 + +packages: + + assemblyscript@0.27.36: + resolution: {integrity: sha512-1qX2zf6p7l/mNYv8r21jC/Yft7kX7XKR3xUHw41zvV4xad5lyC8w7jZiwZBGoy64VKZLc+bTDJDWi8Kb70YrHA==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.36: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/gearhash-wasm/tests/index.test.ts b/packages/gearhash-wasm/tests/index.test.ts new file mode 100644 index 0000000000..35e28eba8a --- /dev/null +++ 
b/packages/gearhash-wasm/tests/index.test.ts @@ -0,0 +1,274 @@ +import { describe, it, expect } from "vitest"; +import { nextMatch, nextMatches } from "../build/debug.js"; + +// Simple deterministic RNG for reproducible results (24-bit version) +// Alternatively, could have used WASM for 64-bit arithmetic. +class SimpleRng { + private state: number; + + constructor(seed: number) { + this.state = seed & 0xffffff; // Keep only 24 bits + } + + nextU24(): number { + // Simple 24-bit linear congruential generator + // Using 24-bit arithmetic to avoid overflow + this.state = (this.state * 1111 + 12345) & 0xffffff; + return this.state; + } + + fillBytes(dest: Uint8Array): void { + for (let i = 0; i < dest.length; i += 3) { + const value = this.nextU24(); + for (let j = 0; j < 3 && i + j < dest.length; j++) { + dest[i + j] = (value >> (j * 8)) & 0xff; + } + } + } +} + +const BENCH_INPUT_SEED = 0xbecd17f; +const BENCH_MASK = 0x0000d90003530000n; +const INPUT_SIZE = 100_000; + +function generateTestInput(): Uint8Array { + const bytes = new Uint8Array(INPUT_SIZE); + const rng = new SimpleRng(BENCH_INPUT_SEED); + rng.fillBytes(bytes); + return bytes; +} + +interface TestResults { + chunkCount: number; + totalProcessed: number; + averageChunkSize: number; +} + +interface ExpectedResult { + chunk: number; + offset: number; + size: number; + hash: string; +} + +function testGearhash(): TestResults { + const inputBuf = generateTestInput(); + + let chunkCount = 0; + let totalProcessed = 0; + + const result = nextMatches(inputBuf, BENCH_MASK, 0n); + const matches = [...result.matches, { position: result.remaining, hash: result.hash }]; + + for (const match of matches) { + totalProcessed += match.position; + chunkCount += 1; + } + + return { chunkCount, totalProcessed, averageChunkSize: totalProcessed / chunkCount }; +} + +// Parse the expected results from Rust +function parseExpectedResults(resultData: string): ExpectedResult[] { + const lines = resultData.trim().split("\n"); + 
const results: ExpectedResult[] = []; + + for (const line of lines) { + const match = line.match(/\s*(\d+)\s*\|\s*(\d+)\s*\|\s*(\d+)\s*\|\s*(0x[a-f0-9]+)/); + if (match) { + results.push({ + chunk: parseInt(match[1]), + offset: parseInt(match[2]), + size: parseInt(match[3]), + hash: match[4], + }); + } + } + + return results; +} + +const resultData = `Chunk | Offset | Size | Hash +------|--------|------|------------------ + 1 | 0 | 3598 | 0x033220f080ac5f77 + 2 | 3598 | 3995 | 0xd06b22f324ac5f28 + 3 | 7593 | 4708 | 0xa3a324f81808429c + 4 | 12301 | 484 | 0x12a5006aa4a4425b + 5 | 12785 | 1484 | 0x0b240413a4a4d5a2 + 6 | 14269 | 563 | 0xc646022fbc848bc6 + 7 | 14832 | 6663 | 0x7c7a2296e4a4c325 + 8 | 21495 | 1220 | 0xbe1f2468f0841b68 + 9 | 22715 | 1175 | 0xf87e2299e00c57d9 + 10 | 23890 | 779 | 0x79ca2634d00cd6b9 + 11 | 24669 | 2069 | 0xcb7a063594081a74 + 12 | 26738 | 2623 | 0xdccc26b6c0acb733 + 13 | 29361 | 596 | 0x4fb6201a1c20143e + 14 | 29957 | 622 | 0x81e726272020706f + 15 | 30579 | 3834 | 0x630622fca084a60a + 16 | 34413 | 2379 | 0x177b2240080810b1 + 17 | 36792 | 3527 | 0x663b261bbc2451ed + 18 | 40319 | 1665 | 0xf94f06db94003e2f + 19 | 41984 | 1240 | 0xc5ca208c0c24cefc + 20 | 43224 | 1274 | 0x8139244f740cba39 + 21 | 44498 | 3680 | 0x4440044520045a9d + 22 | 48178 | 1487 | 0xe00f2049a0a43a58 + 23 | 49665 | 4293 | 0x366a26940408279d + 24 | 53958 | 1184 | 0x3a582683902cb3fe + 25 | 55142 | 383 | 0x002d0499e080702e + 26 | 55525 | 1206 | 0x34ba041aa4084fbd + 27 | 56731 | 506 | 0x0c53045c00a0a228 + 28 | 57237 | 8019 | 0xf85b202d9c0813a5 + 29 | 65256 | 1070 | 0x1c862295ac8863ba + 30 | 66326 | 3359 | 0x4e4804d7b82805c7 + 31 | 69685 | 1744 | 0x75b7224cc8209457 + 32 | 71429 | 152 | 0xb01e26b40c0cf7c0 + 33 | 71581 | 11 | 0xc66002b7f48c0472 + 34 | 71592 | 1209 | 0x0a33021dc4007363 + 35 | 72801 | 1795 | 0xd0cc22ea708c921f + 36 | 74596 | 856 | 0x49e3007c9c2c5727 + 37 | 75452 | 97 | 0xe0b422e3c40c89dc + 38 | 75549 | 1299 | 0xbd1806074024536a + 39 | 76848 | 131 | 0xd61104147c28928d + 
40 | 76979 | 1987 | 0x31930627a080ebb0 + 41 | 78966 | 11254 | 0x4c4400e65c24beff + 42 | 90220 | 868 | 0xa92400ca5ca02488 + 43 | 91088 | 6279 | 0x5a3d0443f0a0d81a + 44 | 97367 | 969 | 0x7770042d140c7472 + 45 | 98336 | 1664 | 0xe508202f55c46d2d`; + +describe("gearhash-wasm", () => { + describe("Basic functionality", () => { + it("should generate test input correctly", () => { + const input = generateTestInput(); + expect(input.length).toBe(INPUT_SIZE); + + // Verify specific byte values for reproducibility + // These values may vary depending on the RNG implementation + expect(typeof input[0]).toBe("number"); + expect(input[0]).toBeGreaterThanOrEqual(0); + expect(input[0]).toBeLessThanOrEqual(255); + expect(typeof input[100]).toBe("number"); + expect(typeof input[1000]).toBe("number"); + }); + + it("should process chunks correctly", () => { + const testResults = testGearhash(); + + expect(testResults.chunkCount).toBeGreaterThan(0); + expect(testResults.totalProcessed).toBe(INPUT_SIZE); + expect(testResults.averageChunkSize).toBeGreaterThan(0); + }); + }); + + describe("Chunk matching accuracy", () => { + it("should match expected results from Rust implementation", () => { + const inputBuf = generateTestInput(); + const result = nextMatches(inputBuf, BENCH_MASK, 0n); + const allMatches = [...result.matches, { position: result.remaining, hash: result.hash }]; + + // Generate actual results in the same format as expected + const actualResults: ExpectedResult[] = []; + let offset = 0; + let chunkCount = 0; + + for (const match of allMatches) { + chunkCount += 1; + actualResults.push({ + chunk: chunkCount, + offset: offset, + size: match.position, + hash: `0x${match.hash.toString(16).padStart(16, "0")}`, + }); + offset += match.position; + } + + // Compare with expected results + const expectedResults = parseExpectedResults(resultData); + const totalChunks = Math.min(actualResults.length, expectedResults.length); + + expect(totalChunks).toBe(expectedResults.length); + 
expect(totalChunks).toBe(45); + + let matchCount = 0; + for (let i = 0; i < totalChunks; i++) { + const actual = actualResults[i]; + const expected = expectedResults[i]; + + if (actual.offset === expected.offset && actual.size === expected.size && actual.hash === expected.hash) { + matchCount++; + } + } + + // We expect an exact (100%) match with the Rust reference implementation + const accuracy = (matchCount / totalChunks) * 100; + expect(accuracy).toBeGreaterThanOrEqual(100); + }); + }); + + describe("Individual chunk processing", () => { + it("should process individual chunks correctly", () => { + const input = generateTestInput(); + let offset = 0; + let hash = 0n; + + while (offset < input.length) { + const result = nextMatch(input.subarray(offset), BENCH_MASK, hash); + + // Position can be -1 to indicate no match found + expect(result.position).toBeGreaterThanOrEqual(-1); + expect(typeof result.hash).toBe("bigint"); + + if (result.position > 0) { + offset += result.position; + hash = result.hash; + } else { + // No more matches, break + break; + } + } + }); + }); + + describe("Edge cases", () => { + it("should handle empty input", () => { + const emptyInput = new Uint8Array(0); + const result = nextMatches(emptyInput, BENCH_MASK, 0n); + + expect(result.matches.length).toBe(0); + expect(result.remaining).toBe(0); + }); + + it("should handle small input", () => { + const smallInput = new Uint8Array([1, 2, 3, 4, 5]); + const result = nextMatches(smallInput, BENCH_MASK, 0n); + + expect(result.matches.length).toBeGreaterThanOrEqual(0); + expect(result.remaining).toBeGreaterThanOrEqual(0); + }); + + it("should handle different masks", () => { + const input = generateTestInput().slice(0, 1000); + const differentMasks = [0x0000ff0000000000n, 0x00000000ff000000n, 0x000000000000ff00n]; + + for (const mask of differentMasks) { + const result = nextMatches(input, mask, 0n); + expect(result.matches.length).toBeGreaterThanOrEqual(0); + } + }); + }); + + describe("Performance 
characteristics", () => { + it("should maintain reasonable chunk sizes", () => { + const testResults = testGearhash(); + + // Average chunk size should be reasonable (not too small, not too large) + expect(testResults.averageChunkSize).toBeGreaterThan(100); + expect(testResults.averageChunkSize).toBeLessThan(10000); + }); + + it("should process all input data", () => { + const testResults = testGearhash(); + expect(testResults.totalProcessed).toBe(INPUT_SIZE); + }); + }); +}); diff --git a/packages/gearhash-wasm/vendor/.gitignore b/packages/gearhash-wasm/vendor/.gitignore new file mode 100644 index 0000000000..293dd90a84 --- /dev/null +++ b/packages/gearhash-wasm/vendor/.gitignore @@ -0,0 +1,4 @@ +/target +**/*.rs.bk +Cargo.lock +.idea \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/Cargo.toml b/packages/gearhash-wasm/vendor/Cargo.toml new file mode 100644 index 0000000000..e425f8932a --- /dev/null +++ b/packages/gearhash-wasm/vendor/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "gearhash" +version = "0.1.3" +edition = "2018" +license = "MIT OR Apache-2.0" +authors = ["Sam Rijs "] +description = "Fast, SIMD-accelerated hash function for content-defined chunking" +repository = "https://github.com/srijs/rust-gearhash" +readme = "README.md" +keywords = ["hash", "gear", "fast", "cdc", "chunking"] + +[features] +bench = [] + +[dependencies] +cfg-if = "0.1.10" + +[dev-dependencies] +lazy_static = "1.4.0" +quickcheck = "0.9.0" +rand = "0.7.2" + +[[bin]] +name = "test_gearhash" +path = "test_gearhash.rs" \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/LICENSE-APACHE b/packages/gearhash-wasm/vendor/LICENSE-APACHE new file mode 100644 index 0000000000..2cdf43fa3e --- /dev/null +++ b/packages/gearhash-wasm/vendor/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 Sam Rijs and contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/LICENSE-MIT b/packages/gearhash-wasm/vendor/LICENSE-MIT new file mode 100644 index 0000000000..487d7160eb --- /dev/null +++ b/packages/gearhash-wasm/vendor/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Sam Rijs and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/README.md b/packages/gearhash-wasm/vendor/README.md new file mode 100644 index 0000000000..34310148b7 --- /dev/null +++ b/packages/gearhash-wasm/vendor/README.md @@ -0,0 +1,60 @@ +# GearHash Test + +This directory contains the GearHash library for content-defined chunking. 
+ +## Running the Test + +To run the test that generates deterministic input and processes it through GearHash: + +```bash +cd packages/gearhash-wasm/vendor +cargo run --bin test_gearhash +``` + +Or if you want to create a binary: + +```bash +cargo build --bin test_gearhash +./target/debug/test_gearhash +``` + +## Test Details + +The test: +1. Generates a 100,000-byte deterministic input using a simple 24-bit LCG RNG with seed `0xbecd17f` +2. Uses mask `0x0000d90003530000` for chunk boundary detection +3. Processes the input through GearHash and reports chunk boundaries +4. Shows chunk sizes, offsets, and hash values for verification + +## AssemblyScript Adaptation + +The test uses a simple deterministic RNG that can be easily ported to AssemblyScript: + +```typescript +class SimpleRng { + private state: u32; + + constructor(seed: u32) { + this.state = seed & 0xffffff; // keep only 24 bits + } + + nextU24(): u32 { + this.state = (this.state * 1111 + 12345) & 0xffffff; + return this.state; + } + + fillBytes(dest: Uint8Array): void { + for (let i = 0; i < dest.length; i += 3) { + const value = this.nextU24(); + for (let j = 0; j < 3 && i + j < dest.length; j++) { + dest[i + j] = ((value >> (j * 8)) & 0xff) as u8; + } + } + } +} +``` + +The test results can be used to verify that the AssemblyScript implementation produces the same chunk boundaries. + diff --git a/packages/gearhash-wasm/vendor/src/lib.rs b/packages/gearhash-wasm/vendor/src/lib.rs new file mode 100644 index 0000000000..58aa95aeee --- /dev/null +++ b/packages/gearhash-wasm/vendor/src/lib.rs @@ -0,0 +1,103 @@ +// From https://github.com/srijs/rust-gearhash/blob/master/src/lib.rs + +//! The GEAR hashing function is a fast, rolling hash function that +//! is well suited for content defined chunking. In particular, it is +//! used as a building block for the [FastCDC](https://www.usenix.org/node/196197) +//! algorithm. +//! +//! The implementation provided in this crate consists of both a simple, +//! 
scalar variant, as well as optimized versions for the SSE4.2 and AVX2 +//! instruction sets. +//! +//! ## Example +//! +//! ``` +//! fn find_all_chunks(buf: &[u8], mask: u64) -> Vec<&[u8]> { +//! // set up initial state +//! let mut chunks = vec![]; +//! let mut offset = 0; +//! +//! // create new hasher +//! let mut hasher = gearhash::Hasher::default(); +//! +//! // loop through all matches, and push the corresponding chunks +//! while let Some(boundary) = hasher.next_match(&buf[offset..], mask) { +//! chunks.push(&buf[offset..offset + boundary]); +//! offset += boundary; +//! } +//! +//! // push final chunk +//! chunks.push(&buf[offset..]); +//! chunks +//! } +//! ``` + +#![cfg_attr(feature = "bench", feature(test))] + +#[cfg(feature = "bench")] +extern crate test; +#[cfg(feature = "bench")] +mod bench; + +mod scalar; +mod table; + +pub use table::{Table, DEFAULT_TABLE}; + +/// Gear hash state. Processes bytes to find chunk boundaries. +#[derive(Clone)] +pub struct Hasher<'t> { + table: &'t Table, + hash: u64, +} + +impl<'t> Hasher<'t> { + /// Create a new hasher with the given table. + pub fn new(table: &'t Table) -> Self { + Self { table, hash: 0 } + } + + /// Update the hash state by processing all the bytes in the given slice. + pub fn update(&mut self, buf: &[u8]) { + for b in buf.iter() { + self.hash = (self.hash << 1).wrapping_add(self.table[*b as usize]); + } + } + + /// Match the current hash state against the given mask. + /// + /// Returns true if `hash & mask == 0`, false otherwise. + pub fn is_match(&self, mask: u64) -> bool { + self.hash & mask == 0 + } + + /// Processes the given byte slice until a match is found for the given mask. + /// + /// If a match is found before the end of the byte slice, it returns the number + /// of bytes processed. If no match has been found, it returns `None`. 
+ pub fn next_match(&mut self, buf: &[u8], mask: u64) -> Option { + crate::scalar::next_match(&mut self.hash, self.table, buf, mask) + } + + /// Retrieve the current hash value. + pub fn get_hash(&self) -> u64 { + self.hash + } + + /// Set the hash value to the given integer. + pub fn set_hash(&mut self, hash: u64) { + self.hash = hash + } +} + +impl Default for Hasher<'static> { + fn default() -> Self { + Hasher::new(&DEFAULT_TABLE) + } +} + +impl<'t> std::fmt::Debug for Hasher<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Hasher").field("hash", &self.hash).finish() + } +} \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/src/scalar.rs b/packages/gearhash-wasm/vendor/src/scalar.rs new file mode 100644 index 0000000000..f06ab449f6 --- /dev/null +++ b/packages/gearhash-wasm/vendor/src/scalar.rs @@ -0,0 +1,14 @@ +use crate::Table; + +#[inline] +pub(crate) fn next_match(hash: &mut u64, table: &Table, buf: &[u8], mask: u64) -> Option { + for (i, b) in buf.iter().enumerate() { + *hash = (*hash << 1).wrapping_add(table[*b as usize]); + + if *hash & mask == 0 { + return Some(i + 1); + } + } + + None +} diff --git a/packages/gearhash-wasm/vendor/src/table.rs b/packages/gearhash-wasm/vendor/src/table.rs new file mode 100644 index 0000000000..898e603422 --- /dev/null +++ b/packages/gearhash-wasm/vendor/src/table.rs @@ -0,0 +1,262 @@ +/// Gear hash table. +pub type Table = [u64; 256]; + +/// Default hash table, using random (but static) integers. 
+pub static DEFAULT_TABLE: Table = [ + 0xb088d3a9e840f559, + 0x5652c7f739ed20d6, + 0x45b28969898972ab, + 0x6b0a89d5b68ec777, + 0x368f573e8b7a31b7, + 0x1dc636dce936d94b, + 0x207a4c4e5554d5b6, + 0xa474b34628239acb, + 0x3b06a83e1ca3b912, + 0x90e78d6c2f02baf7, + 0xe1c92df7150d9a8a, + 0x8e95053a1086d3ad, + 0x5a2ef4f1b83a0722, + 0xa50fac949f807fae, + 0x0e7303eb80d8d681, + 0x99b07edc1570ad0f, + 0x689d2fb555fd3076, + 0x00005082119ea468, + 0xc4b08306a88fcc28, + 0x3eb0678af6374afd, + 0xf19f87ab86ad7436, + 0xf2129fbfbe6bc736, + 0x481149575c98a4ed, + 0x0000010695477bc5, + 0x1fba37801a9ceacc, + 0x3bf06fd663a49b6d, + 0x99687e9782e3874b, + 0x79a10673aa50d8e3, + 0xe4accf9e6211f420, + 0x2520e71f87579071, + 0x2bd5d3fd781a8a9b, + 0x00de4dcddd11c873, + 0xeaa9311c5a87392f, + 0xdb748eb617bc40ff, + 0xaf579a8df620bf6f, + 0x86a6e5da1b09c2b1, + 0xcc2fc30ac322a12e, + 0x355e2afec1f74267, + 0x2d99c8f4c021a47b, + 0xbade4b4a9404cfc3, + 0xf7b518721d707d69, + 0x3286b6587bf32c20, + 0x0000b68886af270c, + 0xa115d6e4db8a9079, + 0x484f7e9c97b2e199, + 0xccca7bb75713e301, + 0xbf2584a62bb0f160, + 0xade7e813625dbcc8, + 0x000070940d87955a, + 0x8ae69108139e626f, + 0xbd776ad72fde38a2, + 0xfb6b001fc2fcc0cf, + 0xc7a474b8e67bc427, + 0xbaf6f11610eb5d58, + 0x09cb1f5b6de770d1, + 0xb0b219e6977d4c47, + 0x00ccbc386ea7ad4a, + 0xcc849d0adf973f01, + 0x73a3ef7d016af770, + 0xc807d2d386bdbdfe, + 0x7f2ac9966c791730, + 0xd037a86bc6c504da, + 0xf3f17c661eaa609d, + 0xaca626b04daae687, + 0x755a99374f4a5b07, + 0x90837ee65b2caede, + 0x6ee8ad93fd560785, + 0x0000d9e11053edd8, + 0x9e063bb2d21cdbd7, + 0x07ab77f12a01d2b2, + 0xec550255e6641b44, + 0x78fb94a8449c14c6, + 0xc7510e1bc6c0f5f5, + 0x0000320b36e4cae3, + 0x827c33262c8b1a2d, + 0x14675f0b48ea4144, + 0x267bd3a6498deceb, + 0xf1916ff982f5035e, + 0x86221b7ff434fb88, + 0x9dbecee7386f49d8, + 0xea58f8cac80f8f4a, + 0x008d198692fc64d8, + 0x6d38704fbabf9a36, + 0xe032cb07d1e7be4c, + 0x228d21f6ad450890, + 0x635cb1bfc02589a5, + 0x4620a1739ca2ce71, + 0xa7e7dfe3aae5fb58, + 0x0c10ca932b3c0deb, + 
0x2727fee884afed7b, + 0xa2df1c6df9e2ab1f, + 0x4dcdd1ac0774f523, + 0x000070ffad33e24e, + 0xa2ace87bc5977816, + 0x9892275ab4286049, + 0xc2861181ddf18959, + 0xbb9972a042483e19, + 0xef70cd3766513078, + 0x00000513abfc9864, + 0xc058b61858c94083, + 0x09e850859725e0de, + 0x9197fb3bf83e7d94, + 0x7e1e626d12b64bce, + 0x520c54507f7b57d1, + 0xbee1797174e22416, + 0x6fd9ac3222e95587, + 0x0023957c9adfbf3e, + 0xa01c7d7e234bbe15, + 0xaba2c758b8a38cbb, + 0x0d1fa0ceec3e2b30, + 0x0bb6a58b7e60b991, + 0x4333dd5b9fa26635, + 0xc2fd3b7d4001c1a3, + 0xfb41802454731127, + 0x65a56185a50d18cb, + 0xf67a02bd8784b54f, + 0x696f11dd67e65063, + 0x00002022fca814ab, + 0x8cd6be912db9d852, + 0x695189b6e9ae8a57, + 0xee9453b50ada0c28, + 0xd8fc5ea91a78845e, + 0xab86bf191a4aa767, + 0x0000c6b5c86415e5, + 0x267310178e08a22e, + 0xed2d101b078bca25, + 0x3b41ed84b226a8fb, + 0x13e622120f28dc06, + 0xa315f5ebfb706d26, + 0x8816c34e3301bace, + 0xe9395b9cbb71fdae, + 0x002ce9202e721648, + 0x4283db1d2bb3c91c, + 0xd77d461ad2b1a6a5, + 0xe2ec17e46eeb866b, + 0xb8e0be4039fbc47c, + 0xdea160c4d5299d04, + 0x7eec86c8d28c3634, + 0x2119ad129f98a399, + 0xa6ccf46b61a283ef, + 0x2c52cedef658c617, + 0x2db4871169acdd83, + 0x0000f0d6f39ecbe9, + 0x3dd5d8c98d2f9489, + 0x8a1872a22b01f584, + 0xf282a4c40e7b3cf2, + 0x8020ec2ccb1ba196, + 0x6693b6e09e59e313, + 0x0000ce19cc7c83eb, + 0x20cb5735f6479c3b, + 0x762ebf3759d75a5b, + 0x207bfe823d693975, + 0xd77dc112339cd9d5, + 0x9ba7834284627d03, + 0x217dc513e95f51e9, + 0xb27b1a29fc5e7816, + 0x00d5cd9831bb662d, + 0x71e39b806d75734c, + 0x7e572af006fb1a23, + 0xa2734f2f6ae91f85, + 0xbf82c6b5022cddf2, + 0x5c3beac60761a0de, + 0xcdc893bb47416998, + 0x6d1085615c187e01, + 0x77f8ae30ac277c5d, + 0x917c6b81122a2c91, + 0x5b75b699add16967, + 0x0000cf6ae79a069b, + 0xf3c40afa60de1104, + 0x2063127aa59167c3, + 0x621de62269d1894d, + 0xd188ac1de62b4726, + 0x107036e2154b673c, + 0x0000b85f28553a1d, + 0xf2ef4e4c18236f3d, + 0xd9d6de6611b9f602, + 0xa1fc7955fb47911c, + 0xeb85fd032f298dbd, + 0xbe27502fb3befae1, + 0xe3034251c4cd661e, 
+ 0x441364d354071836, + 0x0082b36c75f2983e, + 0xb145910316fa66f0, + 0x021c069c9847caf7, + 0x2910dfc75a4b5221, + 0x735b353e1c57a8b5, + 0xce44312ce98ed96c, + 0xbc942e4506bdfa65, + 0xf05086a71257941b, + 0xfec3b215d351cead, + 0x00ae1055e0144202, + 0xf54b40846f42e454, + 0x00007fd9c8bcbcc8, + 0xbfbd9ef317de9bfe, + 0xa804302ff2854e12, + 0x39ce4957a5e5d8d4, + 0xffb9e2a45637ba84, + 0x55b9ad1d9ea0818b, + 0x00008acbf319178a, + 0x48e2bfc8d0fbfb38, + 0x8be39841e848b5e8, + 0x0e2712160696a08b, + 0xd51096e84b44242a, + 0x1101ba176792e13a, + 0xc22e770f4531689d, + 0x1689eff272bbc56c, + 0x00a92a197f5650ec, + 0xbc765990bda1784e, + 0xc61441e392fcb8ae, + 0x07e13a2ced31e4a0, + 0x92cbe984234e9d4d, + 0x8f4ff572bb7d8ac5, + 0x0b9670c00b963bd0, + 0x62955a581a03eb01, + 0x645f83e5ea000254, + 0x41fce516cd88f299, + 0xbbda9748da7a98cf, + 0x0000aab2fe4845fa, + 0x19761b069bf56555, + 0x8b8f5e8343b6ad56, + 0x3e5d1cfd144821d9, + 0xec5c1e2ca2b0cd8f, + 0xfaf7e0fea7fbb57f, + 0x000000d3ba12961b, + 0xda3f90178401b18e, + 0x70ff906de33a5feb, + 0x0527d5a7c06970e7, + 0x22d8e773607c13e9, + 0xc9ab70df643c3bac, + 0xeda4c6dc8abe12e3, + 0xecef1f410033e78a, + 0x0024c2b274ac72cb, + 0x06740d954fa900b4, + 0x1d7a299b323d6304, + 0xb3c37cb298cbead5, + 0xc986e3c76178739b, + 0x9fabea364b46f58a, + 0x6da214c5af85cc56, + 0x17a43ed8b7a38f84, + 0x6eccec511d9adbeb, + 0xf9cab30913335afb, + 0x4a5e60c5f415eed2, + 0x00006967503672b4, + 0x9da51d121454bb87, + 0x84321e13b9bbc816, + 0xfb3d6fb6ab2fdd8d, + 0x60305eed8e160a8d, + 0xcbbf4b14e9946ce8, + 0x00004f63381b10c3, + 0x07d5b7816fcc4e10, + 0xe5a536726a6a8155, + 0x57afb23447a07fdd, + 0x18f346f7abc9d394, + 0x636dc655d61ad33d, + 0xcc8bab4939f7f3f6, + 0x63c7a906c1dd187b, +]; \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/test_gearhash.rs b/packages/gearhash-wasm/vendor/test_gearhash.rs new file mode 100644 index 0000000000..99606f1752 --- /dev/null +++ b/packages/gearhash-wasm/vendor/test_gearhash.rs @@ -0,0 +1,109 @@ +use gearhash::{Hasher, DEFAULT_TABLE}; + +// 
Simple deterministic RNG for reproducible results (24-bit version) +struct SimpleRng { + state: u32, +} + +impl SimpleRng { + fn new(seed: u32) -> Self { + Self { state: seed & 0xFFFFFF } // Keep only 24 bits + } + + fn next_u24(&mut self) -> u32 { + // Simple 24-bit linear congruential generator + // Using 24-bit arithmetic to avoid overflow + self.state = (self.state.wrapping_mul(1111) + 12345) & 0xFFFFFF; + self.state + } + + fn fill_bytes(&mut self, dest: &mut [u8]) { + for chunk in dest.chunks_mut(3) { + let value = self.next_u24(); + for (i, byte) in chunk.iter_mut().enumerate() { + *byte = ((value >> (i * 8)) & 0xFF) as u8; + } + } + } +} + +const BENCH_INPUT_SEED: u32 = 0xbecd17f; +const BENCH_MASK: u64 = 0x0000d90003530000; +const INPUT_SIZE: usize = 100_000; + +fn generate_test_input() -> Vec { + let mut bytes = vec![0u8; INPUT_SIZE]; + let mut rng = SimpleRng::new(BENCH_INPUT_SEED); + rng.fill_bytes(&mut bytes); + bytes +} + +fn test_gearhash() { + println!("Generating test input with seed: 0x{:x}", BENCH_INPUT_SEED); + let input_buf = generate_test_input(); + println!("Input size: {} bytes", input_buf.len()); + println!("Mask: 0x{:x}", BENCH_MASK); + + let mut hasher = Hasher::new(&DEFAULT_TABLE); + let mut offset = 0; + let mut chunk_count = 0; + let mut total_processed = 0; + + println!("\nProcessing chunks:"); + println!("Chunk | Offset | Size | Hash"); + println!("------|--------|------|------------------"); + + while offset < input_buf.len() { + let chunk_start = offset; + + if let Some(match_size) = hasher.next_match(&input_buf[offset..], BENCH_MASK) { + offset += match_size; + total_processed += match_size; + chunk_count += 1; + + println!("{:5} | {:6} | {:4} | 0x{:016x}", + chunk_count, chunk_start, match_size, hasher.get_hash()); + + hasher.set_hash(0); + } else { + // No more matches, process remaining bytes + let remaining = input_buf.len() - offset; + total_processed += remaining; + chunk_count += 1; + + println!("{:5} | {:6} | {:4} | 
0x{:016x} (final)", + chunk_count, offset, remaining, hasher.get_hash()); + break; + } + } + + println!("\nSummary:"); + println!("Total chunks: {}", chunk_count); + println!("Total bytes processed: {}", total_processed); + println!("Average chunk size: {:.1} bytes", total_processed as f64 / chunk_count as f64); + + // Print first few bytes of each chunk for verification + println!("\nFirst 16 bytes of each chunk:"); + offset = 0; + chunk_count = 0; + + while offset < input_buf.len() { + if let Some(match_size) = hasher.next_match(&input_buf[offset..], BENCH_MASK) { + let chunk = &input_buf[offset..offset + match_size]; + println!("Chunk {}: {:02x?}", chunk_count + 1, &chunk[..chunk.len().min(16)]); + offset += match_size; + chunk_count += 1; + } else { + let chunk = &input_buf[offset..]; + println!("Chunk {}: {:02x?} (final)", chunk_count + 1, &chunk[..chunk.len().min(16)]); + break; + } + } +} + +fn main() { + test_gearhash(); + + let input_buf = generate_test_input(); + println!("First 100 bytes: {:02x?}", &input_buf[..100]); +} \ No newline at end of file diff --git a/packages/gearhash-wasm/vitest.config.ts b/packages/gearhash-wasm/vitest.config.ts new file mode 100644 index 0000000000..b6d61dc045 --- /dev/null +++ b/packages/gearhash-wasm/vitest.config.ts @@ -0,0 +1,13 @@ +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + globals: true, + environment: "node", + }, + resolve: { + alias: { + "@huggingface/splitmix64-wasm": "./node_modules/@huggingface/splitmix64-wasm/build/release.js", + }, + }, +}); diff --git a/packages/splitmix64-wasm/.npmignore b/packages/splitmix64-wasm/.npmignore new file mode 100644 index 0000000000..5657f6ea7d --- /dev/null +++ b/packages/splitmix64-wasm/.npmignore @@ -0,0 +1 @@ +vendor \ No newline at end of file diff --git a/packages/splitmix64-wasm/README.md b/packages/splitmix64-wasm/README.md new file mode 100644 index 0000000000..28b3f6c79c --- /dev/null +++ 
b/packages/splitmix64-wasm/README.md @@ -0,0 +1,18 @@ +JS and WASM implementations of splitmix-64 + +Using [AssemblyScript](https://www.assemblyscript.org/) to generate a lean WASM. + +The use of WASM is more for 64 bit arithmetic than for performance. + +Used internally to reproduce rust tests + +Let us know if you want us to expose more functions. + +## Usage + +```javascript +import { createRandomArray } from '@huggingface/splitmix64-wasm'; + +// Create an ArrayBuffer of data, with u64s converted to le u8s +const data = createRandomArray(256_000, 1n); // Example: 256kB of data +``` \ No newline at end of file diff --git a/packages/splitmix64-wasm/asconfig.json b/packages/splitmix64-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/splitmix64-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/splitmix64-wasm/assembly/index.ts b/packages/splitmix64-wasm/assembly/index.ts new file mode 100644 index 0000000000..8ac90cf7d9 --- /dev/null +++ b/packages/splitmix64-wasm/assembly/index.ts @@ -0,0 +1,40 @@ +// fn splitmix64_next(state: &mut u64) -> u64 { +// *state = state.wrapping_add(0x9E3779B97F4A7C15); +// let mut z = *state; +// z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9); +// z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB); +// z ^ (z >> 31) +// } + +// fn create_random_data(n: usize, seed: u64) -> Vec { +// // This test will actually need to be run in different environments, so to generate +// // the table below, create random data using a simple SplitMix rng that can be ported here +// // as is
without dependening on other packages. +// let mut ret = Vec::with_capacity(n + 7); + +// let mut state = seed; + +// while ret.len() < n { +// let next_u64 = splitmix64_next(&mut state); +// ret.extend_from_slice(&next_u64.to_le_bytes()); +// } + +// // Has extra bits on there since we're adding in blocks of 8. +// ret.resize(n, 0); + +// ret +// } + +export function createRandomArray(size: u32, seed: u64): ArrayBuffer { + const array = new ArrayBuffer(size + 7); + const view = new DataView(array); + let state = seed; + for (let i: u32 = 0; i < size; i += 8) { + state = state + 0x9e3779b97f4a7c15; + let z = state; + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + view.setUint64(i, z ^ (z >> 31), true); + } + return array.slice(0, size); +} diff --git a/packages/splitmix64-wasm/assembly/tsconfig.json b/packages/splitmix64-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..8131d68a0a --- /dev/null +++ b/packages/splitmix64-wasm/assembly/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.36/node_modules/assemblyscript/std/assembly.json", + "include": ["./**/*.ts"] +} diff --git a/packages/splitmix64-wasm/build/.gitignore b/packages/splitmix64-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/splitmix64-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/splitmix64-wasm/package.json b/packages/splitmix64-wasm/package.json new file mode 100644 index 0000000000..69d91cfae8 --- /dev/null +++ b/packages/splitmix64-wasm/package.json @@ -0,0 +1,45 @@ +{ + "name": "@huggingface/splitmix64-wasm", + "version": "0.0.1", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "node tests", + "prepare": "pnpm run build" + }, + "keywords": [ + "splitmix64", + 
"assemblyscript", + "assembly", + "wasm" + ], + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "main": "./build/release.js", + "types": "./build/release.d.ts", + "devDependencies": { + "assemblyscript": "0.27.36" + }, + "files": [ + "build/release.js", + "build/release.d.ts", + "build/release.wasm", + "build/release.wat", + "build/release.wasm.map", + "README.md", + "asconfig.json", + "assembly" + ] +} diff --git a/packages/splitmix64-wasm/pnpm-lock.yaml b/packages/splitmix64-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..9d7ac0a92a --- /dev/null +++ b/packages/splitmix64-wasm/pnpm-lock.yaml @@ -0,0 +1,38 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + assemblyscript: + specifier: 0.27.36 + version: 0.27.36 + +packages: + + assemblyscript@0.27.36: + resolution: {integrity: sha512-1qX2zf6p7l/mNYv8r21jC/Yft7kX7XKR3xUHw41zvV4xad5lyC8w7jZiwZBGoy64VKZLc+bTDJDWi8Kb70YrHA==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.36: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/splitmix64-wasm/tests/index.js b/packages/splitmix64-wasm/tests/index.js new file mode 100644 index 0000000000..707d841ccd --- /dev/null +++ b/packages/splitmix64-wasm/tests/index.js @@ -0,0 +1,29 @@ +// #[test] +// fn test_correctness_1mb_random_data() { +// // 
Test this data. +// let data = create_random_data(1000000, 0); + +// // Uncomment these to create the lines below: +// // eprintln!("(data[0], {});", data[0] as usize); +// // eprintln!("(data[127], {});", data[127] as usize); +// // eprintln!("(data[111111], {});", data[111111] as usize); + +// assert_eq!(data[0], 175); +// assert_eq!(data[127], 132); +// assert_eq!(data[111111], 118); + +// } + +import assert from "assert"; +import { createRandomArray } from "../build/debug.js"; + +const data = createRandomArray(1000000, 0); +const array = new Uint8Array(data); + +console.log(array[0]); +console.log(array[127]); +console.log(array[111111]); + +assert.strictEqual(array[0], 175); +assert.strictEqual(array[127], 132); +assert.strictEqual(array[111111], 118); diff --git a/packages/xetchunk-wasm/README.md b/packages/xetchunk-wasm/README.md new file mode 100644 index 0000000000..9bebca391e --- /dev/null +++ b/packages/xetchunk-wasm/README.md @@ -0,0 +1,36 @@ +JS and WASM implementations of https://github.com/huggingface/xet-core/blob/main/deduplication/src/chunking.rs + +Using [AssemblyScript](https://www.assemblyscript.org/) to generate a lean WASM. + +## Usage + +```javascript +import { createChunker, getChunks, nextBlock, finalize, xorbHash } from '@huggingface/xetchunk-wasm'; + +const TARGET_CHUNK_SIZE = Math.pow(2, 12); + +// Create a Uint8Array of data to search through +const data = new Uint8Array(1000000); // Example: 1MB of data +// ... fill data with your content ... 
+ +const chunks = getChunks(data, TARGET_CHUNK_SIZE); +console.log("xorbHash", xorbHash(chunks)); + +// Alternative, in case your data is streaming +const chunker = createChunker(TARGET_CHUNK_SIZE); + +for await (const data of source) { + const chunks = nextBlock(chunker, data); + console.log(chunks); +} + +console.log("last chunk", finalize(chunker)); +``` + +## Benchmarking chunking + +```shell +pnpm install +pnpm --filter xetchunk-wasm build +pnpm --filter xetchunk-wasm bench path/to/a-big-file +``` \ No newline at end of file diff --git a/packages/xetchunk-wasm/asconfig.json b/packages/xetchunk-wasm/asconfig.json new file mode 100644 index 0000000000..b0711e8472 --- /dev/null +++ b/packages/xetchunk-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} diff --git a/packages/xetchunk-wasm/assembly/index.ts b/packages/xetchunk-wasm/assembly/index.ts new file mode 100644 index 0000000000..044729e593 --- /dev/null +++ b/packages/xetchunk-wasm/assembly/index.ts @@ -0,0 +1,2 @@ +export { createChunker, finalize, nextBlock, getChunks, hashToHex } from "./xet-chunker"; +export { xorbHash } from "./xorb-hash"; diff --git a/packages/xetchunk-wasm/assembly/tsconfig.json b/packages/xetchunk-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..8131d68a0a --- /dev/null +++ b/packages/xetchunk-wasm/assembly/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.36/node_modules/assemblyscript/std/assembly.json", + "include": ["./**/*.ts"] +} diff --git a/packages/xetchunk-wasm/assembly/xet-chunker.ts b/packages/xetchunk-wasm/assembly/xet-chunker.ts new file mode 100644
index 0000000000..4a0e957f63 --- /dev/null +++ b/packages/xetchunk-wasm/assembly/xet-chunker.ts @@ -0,0 +1,174 @@ +import { nextMatch } from "@huggingface/gearhash-wasm/assembly"; +import { blake3Keyed } from "@huggingface/blake3-wasm/assembly"; + +// Constants +const TARGET_CHUNK_SIZE: i32 = 64 * 1024; // 64KB +const MINIMUM_CHUNK_DIVISOR: i32 = 8; +const MAXIMUM_CHUNK_MULTIPLIER: i32 = 2; +const HASH_WINDOW_SIZE: i32 = 64; + +const BLAKE3_DATA_KEY = new Uint8Array(32); +const STATIC_KEY: StaticArray = [ + 102, 151, 245, 119, 91, 149, 80, 222, 49, 53, 203, 172, 165, 151, 24, 28, 157, 228, 33, 16, 155, 235, 43, 88, 180, + 208, 176, 75, 147, 173, 242, 41, +]; +for (let i = 0; i < 32; i++) { + BLAKE3_DATA_KEY[i] = STATIC_KEY[i]; +} + +export class Chunk { + hash: Uint8Array; + length: i32; +} + +// Type for the next() method return value +class NextResult { + chunk: Chunk | null; + bytesConsumed: i32; +} + +class XetChunker { + private minimumChunk: i32; + private maximumChunk: i32; + private mask: u64; + private chunkBuf: Uint8Array; + private curChunkLen: i32; + private hash: u64; + + constructor(targetChunkSize: i32 = TARGET_CHUNK_SIZE) { + // Validate target chunk size is a power of 2 + assert(targetChunkSize > 0, "Target chunk size must be greater than 0"); + assert((targetChunkSize & (targetChunkSize - 1)) == 0, "Target chunk size must be a power of 2"); + assert(targetChunkSize > HASH_WINDOW_SIZE, "Target chunk size must be greater than hash window size"); + assert(targetChunkSize < i32.MAX_VALUE, "Target chunk size must be less than i32.MAX_VALUE"); + + let mask = (targetChunkSize - 1) as u64; + // Shift mask left by leading zeros count + mask = mask << clz(mask); + + const maximumChunk = targetChunkSize * MAXIMUM_CHUNK_MULTIPLIER; + + this.minimumChunk = targetChunkSize / MINIMUM_CHUNK_DIVISOR; + this.maximumChunk = maximumChunk; + this.mask = mask; + this.chunkBuf = new Uint8Array(maximumChunk); + this.curChunkLen = 0; + this.hash = 0; + } + + next(data: 
Uint8Array, isFinal: boolean): NextResult { + const nBytes = data.length; + let createChunk = false; + let consumeLen: i32 = 0; + + if (nBytes != 0) { + // Skip minimum chunk size + if (this.curChunkLen + HASH_WINDOW_SIZE < this.minimumChunk) { + const maxAdvance = min(this.minimumChunk - this.curChunkLen - HASH_WINDOW_SIZE - 1, nBytes - consumeLen); + consumeLen += maxAdvance; + this.curChunkLen += maxAdvance; + } + + // Calculate read end + const readEnd = min(nBytes, consumeLen + this.maximumChunk - this.curChunkLen); + + let bytesToNextBoundary: i32; + const matchResult = nextMatch(data.subarray(consumeLen, readEnd), this.mask, this.hash); + + if (matchResult.position != -1) { + bytesToNextBoundary = matchResult.position; + createChunk = true; + this.hash = matchResult.hash; + } else { + bytesToNextBoundary = readEnd - consumeLen; + this.hash = matchResult.hash; + } + + // Check if we hit maximum chunk + if (bytesToNextBoundary + this.curChunkLen >= this.maximumChunk) { + bytesToNextBoundary = this.maximumChunk - this.curChunkLen; + createChunk = true; + } + + this.curChunkLen += bytesToNextBoundary; + consumeLen += bytesToNextBoundary; + + // Copy data to chunk buffer + this.chunkBuf.set(data.subarray(0, consumeLen), this.curChunkLen - consumeLen); + } + + if (createChunk || (isFinal && this.curChunkLen > 0)) { + const chunkData = this.chunkBuf.subarray(0, this.curChunkLen); + const chunk: Chunk = { + length: chunkData.length, + hash: blake3Keyed(chunkData, BLAKE3_DATA_KEY), + }; + this.curChunkLen = 0; + this.hash = 0; + return { + chunk, + bytesConsumed: consumeLen, + }; + } + + return { + chunk: null, + bytesConsumed: consumeLen, + }; + } + + nextBlock(data: Uint8Array, isFinal: boolean): Chunk[] { + const chunks: Chunk[] = []; + let pos: i32 = 0; + + while (pos < data.length) { + const result = this.next(data.subarray(pos), isFinal); + if (result.chunk) { + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion + chunks.push(result.chunk!); + 
} + pos += result.bytesConsumed; + } + + return chunks; + } + + finish(): Chunk | null { + return this.next(new Uint8Array(0), true).chunk; + } +} + +export function createChunker(targetChunkSize: i32 = TARGET_CHUNK_SIZE): XetChunker { + const chunker = new XetChunker(targetChunkSize); + + return chunker; +} + +export function nextBlock(chunker: XetChunker, data: Uint8Array): Chunk[] { + return chunker.nextBlock(data, false); +} + +export function finalize(chunker: XetChunker): Chunk | null { + return chunker.finish(); +} + +export function getChunks(data: Uint8Array, targetChunkSize: i32 = TARGET_CHUNK_SIZE): Chunk[] { + // console.log(`getChunks: ${targetChunkSize} ${data.length}`); + const chunker = createChunker(targetChunkSize); + return chunker.nextBlock(data, true); +} + +export function hashToHex(hash: Uint8Array): string { + const view = new DataView(hash.buffer); + const u64 = view.getUint64(0, true); + const u64_2 = view.getUint64(8, true); + const u64_3 = view.getUint64(16, true); + const u64_4 = view.getUint64(24, true); + + const hex = + u64.toString(16).padStart(16, "0") + + u64_2.toString(16).padStart(16, "0") + + u64_3.toString(16).padStart(16, "0") + + u64_4.toString(16).padStart(16, "0"); + return hex; +} diff --git a/packages/xetchunk-wasm/assembly/xorb-hash.ts b/packages/xetchunk-wasm/assembly/xorb-hash.ts new file mode 100644 index 0000000000..06a0c1069e --- /dev/null +++ b/packages/xetchunk-wasm/assembly/xorb-hash.ts @@ -0,0 +1,74 @@ +import { blake3Keyed } from "@huggingface/blake3-wasm/assembly"; +// eslint-disable-next-line @typescript-eslint/consistent-type-imports +import { Chunk } from "./xet-chunker"; + +const MEAN_CHUNK_PER_NODE = 4; + +// if (MEAN_CHUNK_PER_NODE % 256 !== 0) { +// throw new Error("MEAN_CHUNK_PER_NODE must be a multiple of 256"); +// // ^ So we only need to check the last byte of the last u64 in the chunk hash +// } + +const BLAKE3_NODE_KEY = new Uint8Array(32); +const STATIC_KEY: StaticArray = [ + 1, 126, 197, 199, 
165, 71, 41, 150, 253, 148, 102, 102, 180, 138, 2, 230, 93, 221, 83, 111, 55, 199, 109, 210, 248, + 99, 82, 230, 74, 83, 113, 63, +]; +for (let i = 0; i < 32; i++) { + BLAKE3_NODE_KEY[i] = STATIC_KEY[i]; +} + +const INDEX_OF_LAST_BYTE_OF_LAST_U64_IN_CHUNK_HASH = 3 * 8; +// ^ 32 bytes, 8 bytes per u64, take the first byte of the last u64 due to little endianness +// ^ Assumes that MEAN_CHUNK_PER_NODE is a power of 2 and less than 256 + +export function xorbHash(chunks: Chunk[]): Uint8Array { + // Split chunks in groups of 2 - 2 * MEAN_CHUNK_PER_NODE with mean of MEAN_CHUNK_PER_NODE + // to form a tree of nodes + // Then recursively hash the groups + + if (chunks.length === 0) { + // Return empty hash for empty chunks array + return new Uint8Array(32); + } + + while (chunks.length > 1) { + const nodes: Chunk[] = []; + let currentIndex = 0; + let numOfChildrenSoFar = 0; + // ^ It's 1 less than it should be, propagating because of error in reference implementation + for (let i = 0; i < chunks.length; i++) { + if ( + i === chunks.length - 1 || + numOfChildrenSoFar === 2 * MEAN_CHUNK_PER_NODE || + (numOfChildrenSoFar >= 2 && + chunks[i].hash[INDEX_OF_LAST_BYTE_OF_LAST_U64_IN_CHUNK_HASH] % MEAN_CHUNK_PER_NODE === 0) + ) { + nodes.push(nodeHash(chunks.slice(currentIndex, i + 1))); + currentIndex = i + 1; + numOfChildrenSoFar = 0; + } else { + numOfChildrenSoFar++; + } + } + chunks = nodes; + } + + return nodeHash(chunks).hash; +} + +function nodeHash(chunks: Chunk[]): Chunk { + const array = new Uint8Array((32 + 8) * chunks.length); + const view = new DataView(array.buffer); + let totalLength = 0; + for (let i = 0; i < chunks.length; i++) { + array.set(chunks[i].hash, i * (32 + 8)); + view.setUint64(i * (32 + 8) + 32, chunks[i].length, true); + totalLength += chunks[i].length; + } + const hash = blake3Keyed(array, BLAKE3_NODE_KEY); + return { + hash: hash, + length: totalLength, + }; +} diff --git a/packages/xetchunk-wasm/build/.gitignore 
b/packages/xetchunk-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/xetchunk-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/xetchunk-wasm/package.json b/packages/xetchunk-wasm/package.json new file mode 100644 index 0000000000..16b19f8e6a --- /dev/null +++ b/packages/xetchunk-wasm/package.json @@ -0,0 +1,53 @@ +{ + "name": "@huggingface/xetchunk-wasm", + "version": "0.0.2", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "vitest run", + "bench": "node tests/bench.js", + "prepare": "pnpm run build" + }, + "keywords": [ + "xet", + "chunk", + "chunking", + "assemblyscript", + "assembly", + "wasm" + ], + "dependencies": { + "@huggingface/blake3-wasm": "workspace:*", + "@huggingface/gearhash-wasm": "workspace:*" + }, + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "main": "./build/release.js", + "types": "./build/release.d.ts", + "devDependencies": { + "@huggingface/splitmix64-wasm": "workspace:*", + "assemblyscript": "0.27.36" + }, + "files": [ + "build/release.js", + "build/release.d.ts", + "build/release.wasm", + "build/release.wat", + "build/release.wasm.map", + "README.md", + "asconfig.json", + "assembly" + ] +} diff --git a/packages/xetchunk-wasm/pnpm-lock.yaml b/packages/xetchunk-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..f6fba643b3 --- /dev/null +++ b/packages/xetchunk-wasm/pnpm-lock.yaml @@ -0,0 +1,48 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + '@huggingface/blake3-wasm': + specifier: workspace:* + version: link:../blake3-wasm + 
'@huggingface/gearhash-wasm': + specifier: workspace:* + version: link:../gearhash-wasm + devDependencies: + '@huggingface/splitmix64-wasm': + specifier: workspace:* + version: link:../splitmix64-wasm + assemblyscript: + specifier: 0.27.36 + version: 0.27.36 + +packages: + + assemblyscript@0.27.36: + resolution: {integrity: sha512-1qX2zf6p7l/mNYv8r21jC/Yft7kX7XKR3xUHw41zvV4xad5lyC8w7jZiwZBGoy64VKZLc+bTDJDWi8Kb70YrHA==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.36: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/xetchunk-wasm/tests/bench.js b/packages/xetchunk-wasm/tests/bench.js new file mode 100644 index 0000000000..736742138a --- /dev/null +++ b/packages/xetchunk-wasm/tests/bench.js @@ -0,0 +1,135 @@ +import { parseArgs } from "node:util"; +import { createChunker, finalize, nextBlock } from "../build/release.js"; +import { createReadStream } from "node:fs"; +import { Chunker } from "../vendor/chunker_wasm.js"; + +const { positionals } = parseArgs({ + args: process.argv.slice(2), + allowPositionals: true, +}); + +if (!positionals[0]) { + console.error("Usage: node tests/bench.js "); + process.exit(1); +} + +const BYTES = 100_000_000; +const CHUNK_SIZE = 10_000_000; + +console.log(`loading first ${BYTES.toLocaleString("en-US")} bytes of data in memory`); +const data = new Uint8Array(BYTES); + +const stream = createReadStream(positionals[0]); +let totalRead = 0; + +for await (const chunk of stream) { + data.set(chunk.slice(0, data.length - totalRead), totalRead); + totalRead += chunk.length; + + if 
(totalRead >= data.length) { + stream.close(); + break; + } +} + +if (totalRead < data.length) { + console.log("not enough data, repeating in memory"); + + while (totalRead < data.length) { + data.set(data.subarray(0, Math.min(totalRead || data.length, data.length - totalRead)), totalRead); + totalRead += Math.min(totalRead || data.length, data.length - totalRead); + } +} + +console.log( + `data loaded in memory, starting to process data ${CHUNK_SIZE.toLocaleString( + "en-US" + )} bytes at a time (for a max of 30 seconds)` +); + +function testAssemblyChunker() { + const start = performance.now(); + const chunker = createChunker(64 * 1024); + + let totalProcessed = 0; + let totalChunks = 0; + let stoppedEarly = false; + + for (let i = 0; i < data.length; i += CHUNK_SIZE) { + const chunks = nextBlock(chunker, data.subarray(i, i + CHUNK_SIZE)); + console.log("chunks", chunks.length); + totalProcessed += CHUNK_SIZE; + totalChunks += chunks.length; + + if (performance.now() - start > 30_000) { + console.log("30 seconds elapsed, stopping"); + stoppedEarly = true; + break; + } + } + + if (!stoppedEarly) { + const lastChunk = finalize(chunker); + if (lastChunk) { + totalChunks += 1; + totalProcessed = data.length; + } + } + + console.log( + `chunked ${totalChunks} chunks in ${performance.now() - start}ms, ${( + totalProcessed / + 1_000_000 / + ((performance.now() - start) / 1000) + ).toFixed(3)} MB/s` + ); +} + +testAssemblyChunker(); + +console.log("testing rust Chunker"); + +function testRustChunker() { + const start = performance.now(); + const chunker = new Chunker(64 * 1024); + + let totalProcessed = 0; + let totalChunks = 0; + let stoppedEarly = false; + + let chunks = []; + for (let i = 0; i < data.length; i += CHUNK_SIZE) { + chunks = chunker.add_data(data.subarray(i, i + CHUNK_SIZE)); + console.log("chunks", chunks.length); + totalProcessed += CHUNK_SIZE; + totalChunks += chunks.length; + + if (performance.now() - start > 30_000) { + console.log("30 seconds elapsed, stopping"); + stoppedEarly = true; + break; + } + } + + if (!stoppedEarly) { + chunks = chunker.finish(); + 
if (chunks.length > 0) { + totalChunks += chunks.length; + totalProcessed += chunks.length * chunks[0].length; + } + } + + console.log( + `chunked ${totalChunks} chunks in ${performance.now() - start}ms, ${( + totalProcessed / + 1_000_000 / + ((performance.now() - start) / 1000) + ).toFixed(3)} MB/s` + ); +} + +testRustChunker(); + +console.log("testing assembly Chunker again"); + +testAssemblyChunker(); diff --git a/packages/xetchunk-wasm/tests/index.test.ts b/packages/xetchunk-wasm/tests/index.test.ts new file mode 100644 index 0000000000..bd97e85844 --- /dev/null +++ b/packages/xetchunk-wasm/tests/index.test.ts @@ -0,0 +1,453 @@ +import { describe, it, expect } from "vitest"; +import { createChunker, finalize, nextBlock, getChunks, hashToHex } from "../build/debug.js"; +import { createRandomArray } from "@huggingface/splitmix64-wasm"; + +// Helper function to get chunk boundaries from chunks +function getChunkBoundaries(chunks: Array<{ length: number; hash: Uint8Array }>): number[] { + let pos = 0; + return chunks.map((chunk) => { + pos += chunk.length; + return pos; + }); +} + +describe("xetchunk-wasm", () => { + describe("Basic functionality with 1MB random data", () => { + it("should pass 1MB random data test", async () => { + // Create 1MB of random data with seed 0 + const dataBuffer = createRandomArray(1000000, 0); + const data = new Uint8Array(dataBuffer); + + // Verify specific byte values (from Rust reference) + expect(data[0]).toBe(175); + expect(data[127]).toBe(132); + expect(data[111111]).toBe(118); + + const referenceSha256 = "b3d0a1f7938cd4d8413a4dcffd4313e2e8ac0cb61cb1090eb140ea8e9154befb"; + const sha256 = await crypto.subtle.digest("SHA-256", data); + const sha256Hex = Array.from(new Uint8Array(sha256)) + .map((b) => b.toString(16).padStart(2, "0")) + .join(""); + expect(sha256Hex).toBe(referenceSha256); + + // Get chunks using the default chunker + const chunks = getChunks(data); + + // Get chunk boundaries + const chunkBoundaries = 
getChunkBoundaries(chunks); + + // Expected boundaries from Rust reference + const expectedBoundaries = [ + 84493, 134421, 144853, 243318, 271793, 336457, 467529, 494581, 582000, 596735, 616815, 653164, 678202, 724510, + 815591, 827760, 958832, 991092, 1000000, + ]; + + const expectedChunkHashes = [ + "6eca1e7dadaf08cca5d82d318c800f07c2ddcec115a7e8627e5edd9605b94b8d", + "624ea34d72a06e5d43a1b8dd10763f0d22f165992c397ed97563899b27fdd88a", + "4411d2ec847c6e3f451a7451ff3933adb4f1c31587421b9f730b698a78313b47", + "6342bde97433e29e0779ad33eb8040d986679040361b3cc3a06230fe60dd6c9b", + "405253fcf15bba751adc4f507d3453273daff81ed4d8acd71c521aa1cbddc0b5", + "4482374af7f8bebfdb5c5df0299f80128d6c58886ad7b218c562b1d74064e4cb", + "80acc8d39c853b4b8a8c6ad7b63bf2ea68f62c2226b92f06349f92cc84d213cc", + "a7076d7d343f711fb20fe6cd023248d8d051e8fe7d44172596cd5c7ea7edaf65", + "44755217bbb4dadc81ea7695765230a34a2e6cb3b55f373f1de35aeba79ae92c", + "001adc1d302d5f039278325dcbd5ec3b194f4794f1629d6f962f9f4bb78a7bff", + "f8460a337c186f07c2e225bb287a1d3d3d686dc69d0828e99640f7d8852c5b90", + "c9bc3da29025dc1ba562d303d815151d9a937367abb766ae842a165e8493d9fe", + "5044339dfd65e8163bdfe642614a6be604b04d6aeacf222cf219ad287bfc5cf1", + "163622db0fe0da93f2ef964eed4c485f3c7a9c312f8e8e8a312ab4bb8141f13e", + "e1730534a858aa0258ad8904ef12b829b8a123a1c611250275c9ca9471e4c650", + "1fbc6854f9185caba1e1f55393f41f83b895b18f9c99245c029025ca48f1e14b", + "8a27b66ccf05b864b6bef6fb5c970fe894f73d2330e8e98fe7841dcdbd9e9576", + "bc70a33e7a9ec820cac24b87023469a57bdae1bf91cc3961b95806c64a525221", + "03e5b5f5a088269ec4b329f1e04debfac4cb54b9c0facf038f7e8e0f054be7e2", + ]; + + expect(chunkBoundaries).toEqual(expectedBoundaries); + expect(chunks.map((chunk) => hashToHex(chunk.hash))).toEqual(expectedChunkHashes); + }); + }); + + describe("Constant data test", () => { + it("should pass 1MB constant data test", () => { + // Create 1MB of constant data (value 59) + const data = new Uint8Array(1000000); + data.fill(59); + + // Get 
chunks using the default chunker + const chunks = getChunks(data); + + // Get chunk boundaries + const chunkBoundaries = getChunkBoundaries(chunks); + + // Expected boundaries from Rust reference + const expectedBoundaries = [131072, 262144, 393216, 524288, 655360, 786432, 917504, 1000000]; + + expect(chunkBoundaries).toEqual(expectedBoundaries); + }); + }); + + describe("Chunk boundary consistency test", () => { + it("should maintain consistent chunk boundaries with different block sizes", () => { + // Create 256KB of random data with seed 1 + const dataBuffer = createRandomArray(256000, 1n); + const data = new Uint8Array(dataBuffer); + + // Get reference chunks using the default chunker + const refChunks = getChunks(data); + const refChunkBoundaries = getChunkBoundaries(refChunks); + + // Test with different block sizes + for (const addSize of [1, 37, 255]) { + const chunker = createChunker(); + const altChunks: Array<{ length: number; hash: Uint8Array }> = []; + + let pos = 0; + while (pos < data.length) { + const nextPos = Math.min(pos + addSize, data.length); + const nextChunk = nextBlock(chunker, data.subarray(pos, nextPos)); + altChunks.push(...nextChunk); + pos = nextPos; + } + + // Finalize to get any remaining chunk + const finalChunk = finalize(chunker); + if (finalChunk) { + altChunks.push(finalChunk); + } + + const altBoundaries = getChunkBoundaries(altChunks); + expect(altBoundaries).toEqual(refChunkBoundaries); + } + }); + }); + + describe("Triggering data test", () => { + it("should handle triggering data patterns correctly", () => { + // Create a pattern that triggers boundary detection + // This is a simplified version of the Rust test + function get_triggering_base_data(n: number, padding: number) { + const pattern = new Uint8Array([ + 154, 52, 42, 34, 159, 75, 126, 224, 70, 236, 12, 196, 79, 236, 178, 124, 127, 50, 99, 178, 44, 176, 174, 126, + 250, 235, 205, 174, 252, 122, 35, 10, 20, 101, 214, 69, 193, 8, 115, 105, 158, 228, 120, 111, 136, 
162, 198, + 251, 211, 183, 253, 252, 164, 147, 63, 16, 186, 162, 117, 23, 170, 36, 205, 187, 174, 76, 210, 174, 211, 175, + 12, 173, 145, 59, 2, 70, 222, 181, 159, 227, 182, 156, 189, 51, 226, 106, 24, 50, 183, 157, 140, 10, 8, 23, + 212, 70, 10, 234, 23, 33, 219, 254, 39, 236, 70, 49, 191, 116, 9, 115, 15, 101, 26, 159, 165, 220, 15, 170, + 56, 125, 92, 163, 94, 235, 38, 40, 49, 81, + ]); + + // Create 64KB of data by repeating the pattern + const data = new Uint8Array(n); + + for (let i = 0; i < n; i += pattern.length + padding) { + data.set(pattern.slice(0, Math.min(pattern.length, n - i)), i); + } + + return data; + } + + const data_sample_at_11111 = new Uint8Array(128); + const ref_cb = new Array(128); + + data_sample_at_11111[0] = 236; + ref_cb[0] = [8256, 16448, 24640, 32832, 41024, 49216, 57408, 65536]; + data_sample_at_11111[1] = 50; + ref_cb[1] = [8191, 16447, 24703, 32959, 41215, 49471, 57727, 65536]; + data_sample_at_11111[2] = 36; + ref_cb[2] = [8254, 16574, 24894, 33214, 41534, 49854, 58174, 65536]; + data_sample_at_11111[3] = 116; + ref_cb[3] = [8317, 16570, 24823, 33076, 41329, 49582, 57835, 65536]; + data_sample_at_11111[4] = 126; + ref_cb[4] = [8248, 16564, 24880, 33196, 41512, 49828, 58144, 65536]; + data_sample_at_11111[5] = 145; + ref_cb[5] = [8310, 16556, 24802, 33048, 41294, 49540, 57786, 65536]; + data_sample_at_11111[6] = 235; + ref_cb[6] = [8238, 16546, 24854, 33162, 41470, 49778, 58086, 65536]; + data_sample_at_11111[7] = 228; + ref_cb[7] = [8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536]; + data_sample_at_11111[8] = 70; + ref_cb[8] = [8224, 16520, 24816, 33112, 41408, 49704, 58000, 65536]; + data_sample_at_11111[9] = 178; + ref_cb[9] = [8284, 16504, 24724, 32944, 41164, 49384, 57604, 65536]; + data_sample_at_11111[10] = 173; + ref_cb[10] = [8206, 16486, 24766, 33046, 41326, 49606, 57886, 65536]; + data_sample_at_11111[11] = 0; + ref_cb[11] = [8265, 16466, 24667, 32868, 41069, 49270, 57471, 65536]; + data_sample_at_11111[12] = 252; 
+ ref_cb[12] = [8324, 16452, 24704, 32832, 41084, 49212, 57464, 65536]; + data_sample_at_11111[13] = 159; + ref_cb[13] = [8242, 16561, 24880, 33199, 41518, 49837, 58156, 65536]; + data_sample_at_11111[14] = 69; + ref_cb[14] = [8300, 16536, 24772, 33008, 41244, 49480, 57716, 65536]; + data_sample_at_11111[15] = 219; + ref_cb[15] = [8215, 16509, 24803, 33097, 41391, 49685, 57979, 65536]; + data_sample_at_11111[16] = 126; + ref_cb[16] = [8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536]; + data_sample_at_11111[17] = 10; + ref_cb[17] = [8329, 16457, 24714, 32842, 41099, 49227, 57484, 65536]; + data_sample_at_11111[18] = 124; + ref_cb[18] = [8240, 16562, 24884, 33206, 41528, 49850, 58172, 65536]; + data_sample_at_11111[19] = 24; + ref_cb[19] = [8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536]; + data_sample_at_11111[20] = 196; + ref_cb[20] = [8204, 16492, 24780, 33068, 41356, 49644, 57932, 65536]; + data_sample_at_11111[21] = 106; + ref_cb[21] = [8259, 16454, 24649, 32844, 41039, 49234, 57429, 65536]; + data_sample_at_11111[22] = 196; + ref_cb[22] = [8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536]; + data_sample_at_11111[23] = 183; + ref_cb[23] = [8218, 16523, 24828, 33133, 41438, 49743, 58048, 65536]; + data_sample_at_11111[24] = 124; + ref_cb[24] = [8128, 16328, 24536, 32744, 40952, 49160, 57368, 65536]; + data_sample_at_11111[25] = 70; + ref_cb[25] = [8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536]; + data_sample_at_11111[26] = 126; + ref_cb[26] = [8226, 16542, 24858, 33174, 41490, 49806, 58122, 65536]; + data_sample_at_11111[27] = 191; + ref_cb[27] = [8279, 16494, 24709, 32924, 41139, 49354, 57569, 65536]; + data_sample_at_11111[28] = 69; + ref_cb[28] = [8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536]; + data_sample_at_11111[29] = 163; + ref_cb[29] = [8128, 16392, 24713, 33034, 41355, 49676, 57997, 65536]; + data_sample_at_11111[30] = 252; + ref_cb[30] = [8280, 16496, 24712, 32928, 41144, 49360, 57576, 65536]; + 
data_sample_at_11111[31] = 0; + ref_cb[31] = [8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536]; + data_sample_at_11111[32] = 173; + ref_cb[32] = [8224, 16544, 24864, 33184, 41504, 49824, 58144, 65536]; + data_sample_at_11111[33] = 42; + ref_cb[33] = [8275, 16486, 24697, 32908, 41119, 49330, 57541, 65536]; + data_sample_at_11111[34] = 70; + ref_cb[34] = [8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536]; + data_sample_at_11111[35] = 174; + ref_cb[35] = [8214, 16527, 24840, 33153, 41466, 49779, 58092, 65536]; + data_sample_at_11111[36] = 235; + ref_cb[36] = [8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536]; + data_sample_at_11111[37] = 186; + ref_cb[37] = [8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536]; + data_sample_at_11111[38] = 0; + ref_cb[38] = [8198, 16498, 24798, 33098, 41398, 49698, 57998, 65536]; + data_sample_at_11111[39] = 157; + ref_cb[39] = [8247, 16597, 24947, 33297, 41647, 49997, 58347, 65536]; + data_sample_at_11111[40] = 126; + ref_cb[40] = [8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536]; + data_sample_at_11111[41] = 49; + ref_cb[41] = [8345, 16626, 24907, 33188, 41469, 49750, 58031, 65536]; + data_sample_at_11111[42] = 36; + ref_cb[42] = [8224, 16554, 24884, 33214, 41544, 49874, 58204, 65536]; + data_sample_at_11111[43] = 0; + ref_cb[43] = [8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536]; + data_sample_at_11111[44] = 236; + ref_cb[44] = [8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536]; + data_sample_at_11111[45] = 105; + ref_cb[45] = [8195, 16499, 24803, 33107, 41411, 49715, 58019, 65536]; + data_sample_at_11111[46] = 0; + ref_cb[46] = [8242, 16594, 24946, 33298, 41650, 50002, 58354, 65536]; + data_sample_at_11111[47] = 24; + ref_cb[47] = [8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536]; + data_sample_at_11111[48] = 126; + ref_cb[48] = [8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536]; + data_sample_at_11111[49] = 0; + ref_cb[49] = [8206, 16525, 24844, 33163, 41482, 49801, 58120, 65536]; + 
data_sample_at_11111[50] = 70; + ref_cb[50] = [8252, 16618, 24984, 33350, 41716, 50082, 58448, 65536]; + data_sample_at_11111[51] = 236; + ref_cb[51] = [8298, 16532, 24766, 33000, 41234, 49468, 57702, 65536]; + data_sample_at_11111[52] = 0; + ref_cb[52] = [8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536]; + data_sample_at_11111[53] = 12; + ref_cb[53] = [8209, 16337, 24680, 32808, 41151, 49279, 57622, 65536]; + data_sample_at_11111[54] = 236; + ref_cb[54] = [8254, 16626, 24998, 33370, 41742, 50114, 58486, 65536]; + data_sample_at_11111[55] = 0; + ref_cb[55] = [8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536]; + data_sample_at_11111[56] = 173; + ref_cb[56] = [8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536]; + data_sample_at_11111[57] = 196; + ref_cb[57] = [8204, 16529, 24854, 33179, 41504, 49829, 58154, 65536]; + data_sample_at_11111[58] = 0; + ref_cb[58] = [8248, 16618, 24988, 33358, 41728, 50098, 58468, 65536]; + data_sample_at_11111[59] = 159; + ref_cb[59] = [8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536]; + data_sample_at_11111[60] = 178; + ref_cb[60] = [8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536]; + data_sample_at_11111[61] = 0; + ref_cb[61] = [8191, 16507, 24823, 33139, 41455, 49771, 58087, 65536]; + data_sample_at_11111[62] = 10; + ref_cb[62] = [8234, 16594, 24954, 33314, 41674, 50034, 58394, 65536]; + data_sample_at_11111[63] = 101; + ref_cb[63] = [8277, 16490, 24703, 32916, 41129, 49342, 57555, 65536]; + data_sample_at_11111[64] = 0; + ref_cb[64] = [8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536]; + data_sample_at_11111[65] = 15; + ref_cb[65] = [8363, 16662, 24961, 33260, 41559, 49858, 58157, 65536]; + data_sample_at_11111[66] = 147; + ref_cb[66] = [8212, 16554, 24896, 33238, 41580, 49922, 58264, 65536]; + data_sample_at_11111[67] = 0; + ref_cb[67] = [8254, 16639, 25024, 33409, 41794, 50179, 58564, 65536]; + data_sample_at_11111[68] = 0; + ref_cb[68] = [8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536]; + 
data_sample_at_11111[69] = 227; + ref_cb[69] = [8338, 16612, 24886, 33160, 41434, 49708, 57982, 65536]; + data_sample_at_11111[70] = 126; + ref_cb[70] = [8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536]; + data_sample_at_11111[71] = 0; + ref_cb[71] = [8223, 16581, 24939, 33297, 41655, 50013, 58371, 65536]; + data_sample_at_11111[72] = 101; + ref_cb[72] = [8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536]; + data_sample_at_11111[73] = 186; + ref_cb[73] = [8305, 16546, 24787, 33028, 41269, 49510, 57751, 65536]; + data_sample_at_11111[74] = 52; + ref_cb[74] = [8346, 16628, 24910, 33192, 41474, 49756, 58038, 65536]; + data_sample_at_11111[75] = 0; + ref_cb[75] = [8387, 16515, 24830, 32958, 41273, 49401, 57716, 65536]; + data_sample_at_11111[76] = 70; + ref_cb[76] = [8224, 16588, 24952, 33316, 41680, 50044, 58408, 65536]; + data_sample_at_11111[77] = 228; + ref_cb[77] = [8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536]; + data_sample_at_11111[78] = 0; + ref_cb[78] = [8128, 16338, 24578, 32818, 41058, 49298, 57538, 65536]; + data_sample_at_11111[79] = 0; + ref_cb[79] = [8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536]; + data_sample_at_11111[80] = 50; + ref_cb[80] = [8384, 16704, 25024, 33344, 41664, 49984, 58304, 65536]; + data_sample_at_11111[81] = 214; + ref_cb[81] = [8215, 16575, 24935, 33295, 41655, 50015, 58375, 65536]; + data_sample_at_11111[82] = 0; + ref_cb[82] = [8254, 16654, 25054, 33454, 41854, 50254, 58654, 65536]; + data_sample_at_11111[83] = 0; + ref_cb[83] = [8293, 16522, 24751, 32980, 41209, 49438, 57667, 65536]; + data_sample_at_11111[84] = 50; + ref_cb[84] = [8128, 16388, 24656, 32924, 41192, 49460, 57728, 65536]; + data_sample_at_11111[85] = 69; + ref_cb[85] = [8371, 16678, 24985, 33292, 41599, 49906, 58213, 65536]; + data_sample_at_11111[86] = 0; + ref_cb[86] = [8196, 16324, 24674, 32802, 41152, 49280, 57630, 65536]; + data_sample_at_11111[87] = 0; + ref_cb[87] = [8234, 16619, 25004, 33389, 41774, 50159, 58544, 65536]; + 
data_sample_at_11111[88] = 70; + ref_cb[88] = [8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536]; + data_sample_at_11111[89] = 136; + ref_cb[89] = [8128, 16339, 24585, 32831, 41077, 49323, 57569, 65536]; + data_sample_at_11111[90] = 0; + ref_cb[90] = [8348, 16632, 24916, 33200, 41484, 49768, 58052, 65536]; + data_sample_at_11111[91] = 0; + ref_cb[91] = [8386, 16708, 25030, 33352, 41674, 49996, 58318, 65536]; + data_sample_at_11111[92] = 101; + ref_cb[92] = [8204, 16564, 24924, 33284, 41644, 50004, 58364, 65536]; + data_sample_at_11111[93] = 36; + ref_cb[93] = [8241, 16639, 25037, 33435, 41833, 50231, 58629, 65536]; + data_sample_at_11111[94] = 196; + ref_cb[94] = [8278, 16492, 24706, 32920, 41134, 49348, 57562, 65536]; + data_sample_at_11111[95] = 0; + ref_cb[95] = [8315, 16566, 24817, 33068, 41319, 49570, 57821, 65536]; + data_sample_at_11111[96] = 0; + ref_cb[96] = [8352, 16640, 24928, 33216, 41504, 49792, 58080, 65536]; + data_sample_at_11111[97] = 24; + ref_cb[97] = [8389, 16714, 25039, 33364, 41689, 50014, 58339, 65536]; + data_sample_at_11111[98] = 8; + ref_cb[98] = [8200, 16562, 24924, 33286, 41648, 50010, 58372, 65536]; + data_sample_at_11111[99] = 0; + ref_cb[99] = [8236, 16635, 25034, 33433, 41832, 50231, 58630, 65536]; + data_sample_at_11111[100] = 0; + ref_cb[100] = [8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536]; + data_sample_at_11111[101] = 125; + ref_cb[101] = [8308, 16552, 24796, 33040, 41284, 49528, 57772, 65536]; + data_sample_at_11111[102] = 173; + ref_cb[102] = [8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536]; + data_sample_at_11111[103] = 126; + ref_cb[103] = [8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536]; + data_sample_at_11111[104] = 0; + ref_cb[104] = [8416, 16544, 24888, 33016, 41360, 49488, 57832, 65536]; + data_sample_at_11111[105] = 0; + ref_cb[105] = [8219, 16607, 24995, 33383, 41771, 50159, 58547, 65536]; + data_sample_at_11111[106] = 159; + ref_cb[106] = [8254, 16678, 25102, 33526, 41950, 50374, 58798, 
65536]; + data_sample_at_11111[107] = 210; + ref_cb[107] = [8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536]; + data_sample_at_11111[108] = 178; + ref_cb[108] = [8324, 16584, 24844, 33104, 41364, 49624, 57884, 65536]; + data_sample_at_11111[109] = 0; + ref_cb[109] = [8359, 16654, 24949, 33244, 41539, 49834, 58129, 65536]; + data_sample_at_11111[110] = 0; + ref_cb[110] = [8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536]; + data_sample_at_11111[111] = 170; + ref_cb[111] = [8429, 16794, 25159, 33524, 41889, 50254, 58619, 65536]; + data_sample_at_11111[112] = 173; + ref_cb[112] = [8224, 16624, 25024, 33424, 41824, 50224, 58624, 65536]; + data_sample_at_11111[113] = 235; + ref_cb[113] = [8258, 16452, 24646, 32840, 41034, 49228, 57422, 65536]; + data_sample_at_11111[114] = 0; + ref_cb[114] = [8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536]; + data_sample_at_11111[115] = 0; + ref_cb[115] = [8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536]; + data_sample_at_11111[116] = 0; + ref_cb[116] = [8360, 16656, 24952, 33248, 41544, 49840, 58136, 65536]; + data_sample_at_11111[117] = 24; + ref_cb[117] = [8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536]; + data_sample_at_11111[118] = 228; + ref_cb[118] = [8428, 16792, 25156, 33520, 41884, 50248, 58612, 65536]; + data_sample_at_11111[119] = 0; + ref_cb[119] = [8215, 16613, 25011, 33409, 41807, 50205, 58603, 65536]; + data_sample_at_11111[120] = 0; + ref_cb[120] = [8248, 16680, 25112, 33544, 41976, 50408, 58840, 65536]; + data_sample_at_11111[121] = 0; + ref_cb[121] = [8281, 16498, 24715, 32932, 41149, 49366, 57583, 65536]; + data_sample_at_11111[122] = 101; + ref_cb[122] = [8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536]; + data_sample_at_11111[123] = 174; + ref_cb[123] = [8347, 16630, 24913, 33196, 41479, 49762, 58045, 65536]; + data_sample_at_11111[124] = 126; + ref_cb[124] = [8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536]; + data_sample_at_11111[125] = 0; + ref_cb[125] = [8413, 16762, 
25111, 33460, 41809, 50158, 58507, 65536]; + data_sample_at_11111[126] = 0; + ref_cb[126] = [8192, 16574, 24956, 33338, 41720, 50102, 58484, 65536]; + data_sample_at_11111[127] = 0; + ref_cb[127] = [8224, 16639, 25054, 33469, 41884, 50299, 58714, 65536]; + + // Now run the loop with this reference data. + + for (let i = 0; i < 128; i++) { + console.log(`Running test case ${i}`); + const data = get_triggering_base_data(65536, i); + + expect(data[11111]).toBe(data_sample_at_11111[i]); + + const chunks = getChunks(data); + const chunkBoundaries = getChunkBoundaries(chunks); + + expect(chunkBoundaries).toEqual(ref_cb[i]); + } + }); + }); + + describe("Basic chunker functionality", () => { + it("should create and use chunker correctly", () => { + // Create a small test data + const data = new Uint8Array(100000); + for (let i = 0; i < data.length; i++) { + data[i] = Math.floor(Math.random() * 256); + } + + // Test chunker creation and usage + const chunker = createChunker(); + const chunks = nextBlock(chunker, data); + const finalChunk = finalize(chunker); + + // Verify chunks have the expected structure + for (const chunk of chunks) { + expect(typeof chunk.length).toBe("number"); + expect(typeof chunk.hash).toBe("object"); + expect(chunk.hash instanceof Uint8Array).toBe(true); + } + + if (finalChunk) { + expect(typeof finalChunk.length).toBe("number"); + expect(typeof finalChunk.hash).toBe("object"); + expect(finalChunk.hash instanceof Uint8Array).toBe(true); + } + }); + }); +}); diff --git a/packages/xetchunk-wasm/tests/reference.rs b/packages/xetchunk-wasm/tests/reference.rs new file mode 100644 index 0000000000..8fbee5a6b3 --- /dev/null +++ b/packages/xetchunk-wasm/tests/reference.rs @@ -0,0 +1,406 @@ + +fn get_chunk_boundaries(chunks: &[Chunk]) -> Vec { + chunks + .iter() + .scan(0, |state, chunk| { + *state += chunk.data.len(); + Some(*state) + }) + .collect() +} + +#[test] +fn test_chunk_boundaries() { + let data = create_random_data(256000, 1); + + // Now, run 
the chunks through the default chunker. + let chunks = Chunker::default().next_block(&data, true); + + // Get the boundaries indices as determined by the size of the chunks above. + let ref_chunk_boundaries: Vec = get_chunk_boundaries(&chunks); + + // Test that it's correct across different chunk varieties. + for add_size in [1, 37, 255] { + let mut chunker = Chunker::default(); + + // Add repeatedly in blocks of add_size, appending to alt_chunks + let mut alt_chunks = Vec::with_capacity(chunks.len()); + + let mut pos = 0; + while pos < data.len() { + let next_pos = (pos + add_size).min(data.len()); + let next_chunk = chunker.next_block(&data[pos..next_pos], next_pos == data.len()); + alt_chunks.extend(next_chunk); + pos = next_pos; + } + + let alt_boundaries = get_chunk_boundaries(&alt_chunks); + + assert_eq!(alt_boundaries, ref_chunk_boundaries); + } +} + +#[test] +fn test_correctness_1mb_random_data() { + // Test this data. + let data = create_random_data(1000000, 0); + + // Uncomment these to create the lines below: + // eprintln!("(data[0], {});", data[0] as usize); + // eprintln!("(data[127], {});", data[127] as usize); + // eprintln!("(data[111111], {});", data[111111] as usize); + + assert_eq!(data[0], 175); + assert_eq!(data[127], 132); + assert_eq!(data[111111], 118); + + // Now, run the chunks through the default chunker. + let chunks = Chunker::default().next_block(&data, true); + + // Get the boundaries indices as determined by the size of the chunks above. + let chunk_boundaries: Vec = get_chunk_boundaries(&chunks); + + // Uncomment this to create the line below. + // eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})"); + assert_eq!( + chunk_boundaries, + vec![ + 84493, 134421, 144853, 243318, 271793, 336457, 467529, 494581, 582000, 596735, 616815, 653164, 678202, + 724510, 815591, 827760, 958832, 991092, 1000000 + ] + ); +} + +#[test] +fn test_correctness_1mb_const_data() { + // Test this data. 
+ let data = vec![59u8; 1000000]; + + // Now, run the chunks through the default chunker. + let chunks = Chunker::default().next_block(&data, true); + + // Get the boundaries indices as determined by the size of the chunks above. + let chunk_boundaries: Vec = get_chunk_boundaries(&chunks); + + // Uncomment this to create the line below. + // eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})"); + assert_eq!(chunk_boundaries, vec![131072, 262144, 393216, 524288, 655360, 786432, 917504, 1000000]) +} + +fn get_triggering_base_data(n: usize, padding: usize) -> Vec { + // This pattern is known to trigger the boundary detection in the chunker, so repeat it to test the + // correctness of the minimum chunk size processing. + let mut data = vec![ + 154, 52, 42, 34, 159, 75, 126, 224, 70, 236, 12, 196, 79, 236, 178, 124, 127, 50, 99, 178, 44, 176, 174, + 126, 250, 235, 205, 174, 252, 122, 35, 10, 20, 101, 214, 69, 193, 8, 115, 105, 158, 228, 120, 111, 136, + 162, 198, 251, 211, 183, 253, 252, 164, 147, 63, 16, 186, 162, 117, 23, 170, 36, 205, 187, 174, 76, 210, + 174, 211, 175, 12, 173, 145, 59, 2, 70, 222, 181, 159, 227, 182, 156, 189, 51, 226, 106, 24, 50, 183, 157, + 140, 10, 8, 23, 212, 70, 10, 234, 23, 33, 219, 254, 39, 236, 70, 49, 191, 116, 9, 115, 15, 101, 26, 159, + 165, 220, 15, 170, 56, 125, 92, 163, 94, 235, 38, 40, 49, 81, + ]; + + // Add padding so we can comprehensively test the nuances of boundaries. + data.resize(data.len() + padding, 0u8); + + // Repeat the above pattern until we've filled out n bytes. + while data.len() < n { + let n_take = (n - data.len()).min(data.len()); + data.extend_from_within(0..n_take); + } + + data +} + +#[test] +fn test_correctness_100kb_hitting_data() { + // To ensure we've checked all the nuances of dealing with minimum chunk boundaries, + // and with the correct chunks as well, run through all the different options with the padding, + // checking each one. 
With this, then, we have a pattern that hits once per pattern with varying + // bits between the widths. + + let mut data_sample_at_11111 = [0u8; 128]; + let mut ref_cb = vec![Vec::new(); 128]; + + data_sample_at_11111[0] = 236; + ref_cb[0] = vec![8256, 16448, 24640, 32832, 41024, 49216, 57408, 65536]; + data_sample_at_11111[1] = 50; + ref_cb[1] = vec![8191, 16447, 24703, 32959, 41215, 49471, 57727, 65536]; + data_sample_at_11111[2] = 36; + ref_cb[2] = vec![8254, 16574, 24894, 33214, 41534, 49854, 58174, 65536]; + data_sample_at_11111[3] = 116; + ref_cb[3] = vec![8317, 16570, 24823, 33076, 41329, 49582, 57835, 65536]; + data_sample_at_11111[4] = 126; + ref_cb[4] = vec![8248, 16564, 24880, 33196, 41512, 49828, 58144, 65536]; + data_sample_at_11111[5] = 145; + ref_cb[5] = vec![8310, 16556, 24802, 33048, 41294, 49540, 57786, 65536]; + data_sample_at_11111[6] = 235; + ref_cb[6] = vec![8238, 16546, 24854, 33162, 41470, 49778, 58086, 65536]; + data_sample_at_11111[7] = 228; + ref_cb[7] = vec![8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536]; + data_sample_at_11111[8] = 70; + ref_cb[8] = vec![8224, 16520, 24816, 33112, 41408, 49704, 58000, 65536]; + data_sample_at_11111[9] = 178; + ref_cb[9] = vec![8284, 16504, 24724, 32944, 41164, 49384, 57604, 65536]; + data_sample_at_11111[10] = 173; + ref_cb[10] = vec![8206, 16486, 24766, 33046, 41326, 49606, 57886, 65536]; + data_sample_at_11111[11] = 0; + ref_cb[11] = vec![8265, 16466, 24667, 32868, 41069, 49270, 57471, 65536]; + data_sample_at_11111[12] = 252; + ref_cb[12] = vec![8324, 16452, 24704, 32832, 41084, 49212, 57464, 65536]; + data_sample_at_11111[13] = 159; + ref_cb[13] = vec![8242, 16561, 24880, 33199, 41518, 49837, 58156, 65536]; + data_sample_at_11111[14] = 69; + ref_cb[14] = vec![8300, 16536, 24772, 33008, 41244, 49480, 57716, 65536]; + data_sample_at_11111[15] = 219; + ref_cb[15] = vec![8215, 16509, 24803, 33097, 41391, 49685, 57979, 65536]; + data_sample_at_11111[16] = 126; + ref_cb[16] = vec![8272, 16480, 
24688, 32896, 41104, 49312, 57520, 65536]; + data_sample_at_11111[17] = 10; + ref_cb[17] = vec![8329, 16457, 24714, 32842, 41099, 49227, 57484, 65536]; + data_sample_at_11111[18] = 124; + ref_cb[18] = vec![8240, 16562, 24884, 33206, 41528, 49850, 58172, 65536]; + data_sample_at_11111[19] = 24; + ref_cb[19] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536]; + data_sample_at_11111[20] = 196; + ref_cb[20] = vec![8204, 16492, 24780, 33068, 41356, 49644, 57932, 65536]; + data_sample_at_11111[21] = 106; + ref_cb[21] = vec![8259, 16454, 24649, 32844, 41039, 49234, 57429, 65536]; + data_sample_at_11111[22] = 196; + ref_cb[22] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536]; + data_sample_at_11111[23] = 183; + ref_cb[23] = vec![8218, 16523, 24828, 33133, 41438, 49743, 58048, 65536]; + data_sample_at_11111[24] = 124; + ref_cb[24] = vec![8128, 16328, 24536, 32744, 40952, 49160, 57368, 65536]; + data_sample_at_11111[25] = 70; + ref_cb[25] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536]; + data_sample_at_11111[26] = 126; + ref_cb[26] = vec![8226, 16542, 24858, 33174, 41490, 49806, 58122, 65536]; + data_sample_at_11111[27] = 191; + ref_cb[27] = vec![8279, 16494, 24709, 32924, 41139, 49354, 57569, 65536]; + data_sample_at_11111[28] = 69; + ref_cb[28] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536]; + data_sample_at_11111[29] = 163; + ref_cb[29] = vec![8128, 16392, 24713, 33034, 41355, 49676, 57997, 65536]; + data_sample_at_11111[30] = 252; + ref_cb[30] = vec![8280, 16496, 24712, 32928, 41144, 49360, 57576, 65536]; + data_sample_at_11111[31] = 0; + ref_cb[31] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536]; + data_sample_at_11111[32] = 173; + ref_cb[32] = vec![8224, 16544, 24864, 33184, 41504, 49824, 58144, 65536]; + data_sample_at_11111[33] = 42; + ref_cb[33] = vec![8275, 16486, 24697, 32908, 41119, 49330, 57541, 65536]; + data_sample_at_11111[34] = 70; + ref_cb[34] = vec![8326, 16588, 24850, 33112, 41374, 49636, 
57898, 65536]; + data_sample_at_11111[35] = 174; + ref_cb[35] = vec![8214, 16527, 24840, 33153, 41466, 49779, 58092, 65536]; + data_sample_at_11111[36] = 235; + ref_cb[36] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536]; + data_sample_at_11111[37] = 186; + ref_cb[37] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536]; + data_sample_at_11111[38] = 0; + ref_cb[38] = vec![8198, 16498, 24798, 33098, 41398, 49698, 57998, 65536]; + data_sample_at_11111[39] = 157; + ref_cb[39] = vec![8247, 16597, 24947, 33297, 41647, 49997, 58347, 65536]; + data_sample_at_11111[40] = 126; + ref_cb[40] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536]; + data_sample_at_11111[41] = 49; + ref_cb[41] = vec![8345, 16626, 24907, 33188, 41469, 49750, 58031, 65536]; + data_sample_at_11111[42] = 36; + ref_cb[42] = vec![8224, 16554, 24884, 33214, 41544, 49874, 58204, 65536]; + data_sample_at_11111[43] = 0; + ref_cb[43] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536]; + data_sample_at_11111[44] = 236; + ref_cb[44] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536]; + data_sample_at_11111[45] = 105; + ref_cb[45] = vec![8195, 16499, 24803, 33107, 41411, 49715, 58019, 65536]; + data_sample_at_11111[46] = 0; + ref_cb[46] = vec![8242, 16594, 24946, 33298, 41650, 50002, 58354, 65536]; + data_sample_at_11111[47] = 24; + ref_cb[47] = vec![8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536]; + data_sample_at_11111[48] = 126; + ref_cb[48] = vec![8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536]; + data_sample_at_11111[49] = 0; + ref_cb[49] = vec![8206, 16525, 24844, 33163, 41482, 49801, 58120, 65536]; + data_sample_at_11111[50] = 70; + ref_cb[50] = vec![8252, 16618, 24984, 33350, 41716, 50082, 58448, 65536]; + data_sample_at_11111[51] = 236; + ref_cb[51] = vec![8298, 16532, 24766, 33000, 41234, 49468, 57702, 65536]; + data_sample_at_11111[52] = 0; + ref_cb[52] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536]; + 
data_sample_at_11111[53] = 12; + ref_cb[53] = vec![8209, 16337, 24680, 32808, 41151, 49279, 57622, 65536]; + data_sample_at_11111[54] = 236; + ref_cb[54] = vec![8254, 16626, 24998, 33370, 41742, 50114, 58486, 65536]; + data_sample_at_11111[55] = 0; + ref_cb[55] = vec![8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536]; + data_sample_at_11111[56] = 173; + ref_cb[56] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536]; + data_sample_at_11111[57] = 196; + ref_cb[57] = vec![8204, 16529, 24854, 33179, 41504, 49829, 58154, 65536]; + data_sample_at_11111[58] = 0; + ref_cb[58] = vec![8248, 16618, 24988, 33358, 41728, 50098, 58468, 65536]; + data_sample_at_11111[59] = 159; + ref_cb[59] = vec![8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536]; + data_sample_at_11111[60] = 178; + ref_cb[60] = vec![8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536]; + data_sample_at_11111[61] = 0; + ref_cb[61] = vec![8191, 16507, 24823, 33139, 41455, 49771, 58087, 65536]; + data_sample_at_11111[62] = 10; + ref_cb[62] = vec![8234, 16594, 24954, 33314, 41674, 50034, 58394, 65536]; + data_sample_at_11111[63] = 101; + ref_cb[63] = vec![8277, 16490, 24703, 32916, 41129, 49342, 57555, 65536]; + data_sample_at_11111[64] = 0; + ref_cb[64] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536]; + data_sample_at_11111[65] = 15; + ref_cb[65] = vec![8363, 16662, 24961, 33260, 41559, 49858, 58157, 65536]; + data_sample_at_11111[66] = 147; + ref_cb[66] = vec![8212, 16554, 24896, 33238, 41580, 49922, 58264, 65536]; + data_sample_at_11111[67] = 0; + ref_cb[67] = vec![8254, 16639, 25024, 33409, 41794, 50179, 58564, 65536]; + data_sample_at_11111[68] = 0; + ref_cb[68] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536]; + data_sample_at_11111[69] = 227; + ref_cb[69] = vec![8338, 16612, 24886, 33160, 41434, 49708, 57982, 65536]; + data_sample_at_11111[70] = 126; + ref_cb[70] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536]; + data_sample_at_11111[71] = 0; + 
ref_cb[71] = vec![8223, 16581, 24939, 33297, 41655, 50013, 58371, 65536]; + data_sample_at_11111[72] = 101; + ref_cb[72] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536]; + data_sample_at_11111[73] = 186; + ref_cb[73] = vec![8305, 16546, 24787, 33028, 41269, 49510, 57751, 65536]; + data_sample_at_11111[74] = 52; + ref_cb[74] = vec![8346, 16628, 24910, 33192, 41474, 49756, 58038, 65536]; + data_sample_at_11111[75] = 0; + ref_cb[75] = vec![8387, 16515, 24830, 32958, 41273, 49401, 57716, 65536]; + data_sample_at_11111[76] = 70; + ref_cb[76] = vec![8224, 16588, 24952, 33316, 41680, 50044, 58408, 65536]; + data_sample_at_11111[77] = 228; + ref_cb[77] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536]; + data_sample_at_11111[78] = 0; + ref_cb[78] = vec![8128, 16338, 24578, 32818, 41058, 49298, 57538, 65536]; + data_sample_at_11111[79] = 0; + ref_cb[79] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536]; + data_sample_at_11111[80] = 50; + ref_cb[80] = vec![8384, 16704, 25024, 33344, 41664, 49984, 58304, 65536]; + data_sample_at_11111[81] = 214; + ref_cb[81] = vec![8215, 16575, 24935, 33295, 41655, 50015, 58375, 65536]; + data_sample_at_11111[82] = 0; + ref_cb[82] = vec![8254, 16654, 25054, 33454, 41854, 50254, 58654, 65536]; + data_sample_at_11111[83] = 0; + ref_cb[83] = vec![8293, 16522, 24751, 32980, 41209, 49438, 57667, 65536]; + data_sample_at_11111[84] = 50; + ref_cb[84] = vec![8128, 16388, 24656, 32924, 41192, 49460, 57728, 65536]; + data_sample_at_11111[85] = 69; + ref_cb[85] = vec![8371, 16678, 24985, 33292, 41599, 49906, 58213, 65536]; + data_sample_at_11111[86] = 0; + ref_cb[86] = vec![8196, 16324, 24674, 32802, 41152, 49280, 57630, 65536]; + data_sample_at_11111[87] = 0; + ref_cb[87] = vec![8234, 16619, 25004, 33389, 41774, 50159, 58544, 65536]; + data_sample_at_11111[88] = 70; + ref_cb[88] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536]; + data_sample_at_11111[89] = 136; + ref_cb[89] = vec![8128, 16339, 24585, 
32831, 41077, 49323, 57569, 65536]; + data_sample_at_11111[90] = 0; + ref_cb[90] = vec![8348, 16632, 24916, 33200, 41484, 49768, 58052, 65536]; + data_sample_at_11111[91] = 0; + ref_cb[91] = vec![8386, 16708, 25030, 33352, 41674, 49996, 58318, 65536]; + data_sample_at_11111[92] = 101; + ref_cb[92] = vec![8204, 16564, 24924, 33284, 41644, 50004, 58364, 65536]; + data_sample_at_11111[93] = 36; + ref_cb[93] = vec![8241, 16639, 25037, 33435, 41833, 50231, 58629, 65536]; + data_sample_at_11111[94] = 196; + ref_cb[94] = vec![8278, 16492, 24706, 32920, 41134, 49348, 57562, 65536]; + data_sample_at_11111[95] = 0; + ref_cb[95] = vec![8315, 16566, 24817, 33068, 41319, 49570, 57821, 65536]; + data_sample_at_11111[96] = 0; + ref_cb[96] = vec![8352, 16640, 24928, 33216, 41504, 49792, 58080, 65536]; + data_sample_at_11111[97] = 24; + ref_cb[97] = vec![8389, 16714, 25039, 33364, 41689, 50014, 58339, 65536]; + data_sample_at_11111[98] = 8; + ref_cb[98] = vec![8200, 16562, 24924, 33286, 41648, 50010, 58372, 65536]; + data_sample_at_11111[99] = 0; + ref_cb[99] = vec![8236, 16635, 25034, 33433, 41832, 50231, 58630, 65536]; + data_sample_at_11111[100] = 0; + ref_cb[100] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536]; + data_sample_at_11111[101] = 125; + ref_cb[101] = vec![8308, 16552, 24796, 33040, 41284, 49528, 57772, 65536]; + data_sample_at_11111[102] = 173; + ref_cb[102] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536]; + data_sample_at_11111[103] = 126; + ref_cb[103] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536]; + data_sample_at_11111[104] = 0; + ref_cb[104] = vec![8416, 16544, 24888, 33016, 41360, 49488, 57832, 65536]; + data_sample_at_11111[105] = 0; + ref_cb[105] = vec![8219, 16607, 24995, 33383, 41771, 50159, 58547, 65536]; + data_sample_at_11111[106] = 159; + ref_cb[106] = vec![8254, 16678, 25102, 33526, 41950, 50374, 58798, 65536]; + data_sample_at_11111[107] = 210; + ref_cb[107] = vec![8289, 16514, 24739, 32964, 41189, 49414, 
57639, 65536]; + data_sample_at_11111[108] = 178; + ref_cb[108] = vec![8324, 16584, 24844, 33104, 41364, 49624, 57884, 65536]; + data_sample_at_11111[109] = 0; + ref_cb[109] = vec![8359, 16654, 24949, 33244, 41539, 49834, 58129, 65536]; + data_sample_at_11111[110] = 0; + ref_cb[110] = vec![8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536]; + data_sample_at_11111[111] = 170; + ref_cb[111] = vec![8429, 16794, 25159, 33524, 41889, 50254, 58619, 65536]; + data_sample_at_11111[112] = 173; + ref_cb[112] = vec![8224, 16624, 25024, 33424, 41824, 50224, 58624, 65536]; + data_sample_at_11111[113] = 235; + ref_cb[113] = vec![8258, 16452, 24646, 32840, 41034, 49228, 57422, 65536]; + data_sample_at_11111[114] = 0; + ref_cb[114] = vec![8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536]; + data_sample_at_11111[115] = 0; + ref_cb[115] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536]; + data_sample_at_11111[116] = 0; + ref_cb[116] = vec![8360, 16656, 24952, 33248, 41544, 49840, 58136, 65536]; + data_sample_at_11111[117] = 24; + ref_cb[117] = vec![8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536]; + data_sample_at_11111[118] = 228; + ref_cb[118] = vec![8428, 16792, 25156, 33520, 41884, 50248, 58612, 65536]; + data_sample_at_11111[119] = 0; + ref_cb[119] = vec![8215, 16613, 25011, 33409, 41807, 50205, 58603, 65536]; + data_sample_at_11111[120] = 0; + ref_cb[120] = vec![8248, 16680, 25112, 33544, 41976, 50408, 58840, 65536]; + data_sample_at_11111[121] = 0; + ref_cb[121] = vec![8281, 16498, 24715, 32932, 41149, 49366, 57583, 65536]; + data_sample_at_11111[122] = 101; + ref_cb[122] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536]; + data_sample_at_11111[123] = 174; + ref_cb[123] = vec![8347, 16630, 24913, 33196, 41479, 49762, 58045, 65536]; + data_sample_at_11111[124] = 126; + ref_cb[124] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536]; + data_sample_at_11111[125] = 0; + ref_cb[125] = vec![8413, 16762, 25111, 33460, 41809, 50158, 
58507, 65536]; + data_sample_at_11111[126] = 0; + ref_cb[126] = vec![8192, 16574, 24956, 33338, 41720, 50102, 58484, 65536]; + data_sample_at_11111[127] = 0; + ref_cb[127] = vec![8224, 16639, 25054, 33469, 41884, 50299, 58714, 65536]; + + // Now run the loop with this reference data. + for i in 0..128 { + let data = get_triggering_base_data(65536, i); + + // This check is here so that the tests written against this chunker + // can verify that the test data input is correct. + assert_eq!(data[11111], data_sample_at_11111[i]); + + // Uncomment to create the line above. + // eprintln!("data_sample_at_11111[{i}]={};", data[11111]); + + // Now, run the chunks through the default chunker. + let chunks = Chunker::default().next_block(&data, true); + + // Get the boundaries indices as determined by the size of the chunks above. + let chunk_boundaries: Vec = get_chunk_boundaries(&chunks); + + // Uncomment this to generate the table above. + // eprintln!("ref_cb[{i}]=vec!{chunk_boundaries:?};"); + + assert_eq!(chunk_boundaries, ref_cb[i]); + } + + // eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})"); + // assert_eq!(chunk_boundaries, vec![131072, 262144, 393216, 524288, 655360, 786432, 917504, 1000000]) +} \ No newline at end of file diff --git a/packages/xetchunk-wasm/vendor/README.md b/packages/xetchunk-wasm/vendor/README.md new file mode 100644 index 0000000000..a1f2556344 --- /dev/null +++ b/packages/xetchunk-wasm/vendor/README.md @@ -0,0 +1 @@ +This is the WASM generated from the rust client \ No newline at end of file diff --git a/packages/xetchunk-wasm/vendor/chunker_wasm.d.ts b/packages/xetchunk-wasm/vendor/chunker_wasm.d.ts new file mode 100644 index 0000000000..33c9c7efcc --- /dev/null +++ b/packages/xetchunk-wasm/vendor/chunker_wasm.d.ts @@ -0,0 +1,9 @@ +/* tslint:disable */ +/* eslint-disable */ +export function compute_xorb_hash(chunks_array: any): string; +export class Chunker { + free(): void; + constructor(target_chunk_size: number); + 
add_data(data: Uint8Array): any; + finish(): any; +} diff --git a/packages/xetchunk-wasm/vendor/chunker_wasm.js b/packages/xetchunk-wasm/vendor/chunker_wasm.js new file mode 100644 index 0000000000..d689c94c90 --- /dev/null +++ b/packages/xetchunk-wasm/vendor/chunker_wasm.js @@ -0,0 +1,30 @@ +// export * from "./chunker_wasm_bg.js"; +import * as __glue_imports from "./chunker_wasm_bg.js"; + +const wasmUrl = new URL("./chunker_wasm_bg.wasm", import.meta.url); +const binary = await (await import("node:fs/promises")).readFile(wasmUrl); +// console.log("binary", binary); + +const wasmModule = await WebAssembly.compile(binary); +const imports = Object.entries( + WebAssembly.Module.imports(wasmModule).reduce( + (result, item) => ({ + ...result, + [item.module]: [...(result[item.module] || []), item.name], + }), + {} + ) +).map(([from, names]) => ({ from, names })); + +// const exports = WebAssembly.Module.exports(wasmModule).map((item) => item.name); + +// console.log("imports", imports); + +const wasm = await WebAssembly.instantiate(wasmModule, { + "./chunker_wasm_bg.js": Object.fromEntries(imports[0].names.map((name) => [name, __glue_imports[name]])), +}); +export * from "./chunker_wasm_bg.js"; +import { __wbg_set_wasm } from "./chunker_wasm_bg.js"; +__wbg_set_wasm(wasm.exports); +// console.log("exports", exports); +wasm.exports.__wbindgen_start(); diff --git a/packages/xetchunk-wasm/vendor/chunker_wasm_bg.js b/packages/xetchunk-wasm/vendor/chunker_wasm_bg.js new file mode 100644 index 0000000000..d8bfe1c7d9 --- /dev/null +++ b/packages/xetchunk-wasm/vendor/chunker_wasm_bg.js @@ -0,0 +1,490 @@ +let wasm; +export function __wbg_set_wasm(val) { + wasm = val; +} + + +let WASM_VECTOR_LEN = 0; + +let cachedUint8ArrayMemory0 = null; + +function getUint8ArrayMemory0() { + if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) { + cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer); + } + return cachedUint8ArrayMemory0; +} + +const 
lTextEncoder = typeof TextEncoder === 'undefined' ? (0, module.require)('util').TextEncoder : TextEncoder; + +let cachedTextEncoder = new lTextEncoder('utf-8'); + +const encodeString = (typeof cachedTextEncoder.encodeInto === 'function' + ? function (arg, view) { + return cachedTextEncoder.encodeInto(arg, view); +} + : function (arg, view) { + const buf = cachedTextEncoder.encode(arg); + view.set(buf); + return { + read: arg.length, + written: buf.length + }; +}); + +function passStringToWasm0(arg, malloc, realloc) { + + if (realloc === undefined) { + const buf = cachedTextEncoder.encode(arg); + const ptr = malloc(buf.length, 1) >>> 0; + getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf); + WASM_VECTOR_LEN = buf.length; + return ptr; + } + + let len = arg.length; + let ptr = malloc(len, 1) >>> 0; + + const mem = getUint8ArrayMemory0(); + + let offset = 0; + + for (; offset < len; offset++) { + const code = arg.charCodeAt(offset); + if (code > 0x7F) break; + mem[ptr + offset] = code; + } + + if (offset !== len) { + if (offset !== 0) { + arg = arg.slice(offset); + } + ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0; + const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len); + const ret = encodeString(arg, view); + + offset += ret.written; + ptr = realloc(ptr, len, offset, 1) >>> 0; + } + + WASM_VECTOR_LEN = offset; + return ptr; +} + +let cachedDataViewMemory0 = null; + +function getDataViewMemory0() { + if (cachedDataViewMemory0 === null || cachedDataViewMemory0.buffer.detached === true || (cachedDataViewMemory0.buffer.detached === undefined && cachedDataViewMemory0.buffer !== wasm.memory.buffer)) { + cachedDataViewMemory0 = new DataView(wasm.memory.buffer); + } + return cachedDataViewMemory0; +} + +function addToExternrefTable0(obj) { + const idx = wasm.__externref_table_alloc(); + wasm.__wbindgen_export_4.set(idx, obj); + return idx; +} + +function handleError(f, args) { + try { + return f.apply(this, args); + } catch (e) 
{ + const idx = addToExternrefTable0(e); + wasm.__wbindgen_exn_store(idx); + } +} + +const lTextDecoder = typeof TextDecoder === 'undefined' ? (0, module.require)('util').TextDecoder : TextDecoder; + +let cachedTextDecoder = new lTextDecoder('utf-8', { ignoreBOM: true, fatal: true }); + +cachedTextDecoder.decode(); + +function getStringFromWasm0(ptr, len) { + ptr = ptr >>> 0; + return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len)); +} + +function debugString(val) { + // primitive types + const type = typeof val; + if (type == 'number' || type == 'boolean' || val == null) { + return `${val}`; + } + if (type == 'string') { + return `"${val}"`; + } + if (type == 'symbol') { + const description = val.description; + if (description == null) { + return 'Symbol'; + } else { + return `Symbol(${description})`; + } + } + if (type == 'function') { + const name = val.name; + if (typeof name == 'string' && name.length > 0) { + return `Function(${name})`; + } else { + return 'Function'; + } + } + // objects + if (Array.isArray(val)) { + const length = val.length; + let debug = '['; + if (length > 0) { + debug += debugString(val[0]); + } + for(let i = 1; i < length; i++) { + debug += ', ' + debugString(val[i]); + } + debug += ']'; + return debug; + } + // Test for built-in + const builtInMatches = /\[object ([^\]]+)\]/.exec(toString.call(val)); + let className; + if (builtInMatches && builtInMatches.length > 1) { + className = builtInMatches[1]; + } else { + // Failed to match the standard '[object ClassName]' + return toString.call(val); + } + if (className == 'Object') { + // we're a user defined class or Object + // JSON.stringify avoids problems with cycles, and is generally much + // easier than looping through ownProperties of `val`. 
+ try { + return 'Object(' + JSON.stringify(val) + ')'; + } catch (_) { + return 'Object'; + } + } + // errors + if (val instanceof Error) { + return `${val.name}: ${val.message}\n${val.stack}`; + } + // TODO we could test for more things here, like `Set`s and `Map`s. + return className; +} + +function isLikeNone(x) { + return x === undefined || x === null; +} + +function passArray8ToWasm0(arg, malloc) { + const ptr = malloc(arg.length * 1, 1) >>> 0; + getUint8ArrayMemory0().set(arg, ptr / 1); + WASM_VECTOR_LEN = arg.length; + return ptr; +} + +function takeFromExternrefTable0(idx) { + const value = wasm.__wbindgen_export_4.get(idx); + wasm.__externref_table_dealloc(idx); + return value; +} +/** + * @param {any} chunks_array + * @returns {string} + */ +export function compute_xorb_hash(chunks_array) { + let deferred2_0; + let deferred2_1; + try { + const ret = wasm.compute_xorb_hash(chunks_array); + var ptr1 = ret[0]; + var len1 = ret[1]; + if (ret[3]) { + ptr1 = 0; len1 = 0; + throw takeFromExternrefTable0(ret[2]); + } + deferred2_0 = ptr1; + deferred2_1 = len1; + return getStringFromWasm0(ptr1, len1); + } finally { + wasm.__wbindgen_free(deferred2_0, deferred2_1, 1); + } +} + +const ChunkerFinalization = (typeof FinalizationRegistry === 'undefined') + ? 
{ register: () => {}, unregister: () => {} } + : new FinalizationRegistry(ptr => wasm.__wbg_chunker_free(ptr >>> 0, 1)); + +export class Chunker { + + __destroy_into_raw() { + const ptr = this.__wbg_ptr; + this.__wbg_ptr = 0; + ChunkerFinalization.unregister(this); + return ptr; + } + + free() { + const ptr = this.__destroy_into_raw(); + wasm.__wbg_chunker_free(ptr, 0); + } + /** + * @param {number} target_chunk_size + */ + constructor(target_chunk_size) { + const ret = wasm.chunker_new(target_chunk_size); + this.__wbg_ptr = ret >>> 0; + ChunkerFinalization.register(this, this.__wbg_ptr, this); + return this; + } + /** + * @param {Uint8Array} data + * @returns {any} + */ + add_data(data) { + const ptr0 = passArray8ToWasm0(data, wasm.__wbindgen_malloc); + const len0 = WASM_VECTOR_LEN; + const ret = wasm.chunker_add_data(this.__wbg_ptr, ptr0, len0); + if (ret[2]) { + throw takeFromExternrefTable0(ret[1]); + } + return takeFromExternrefTable0(ret[0]); + } + /** + * @returns {any} + */ + finish() { + const ret = wasm.chunker_finish(this.__wbg_ptr); + if (ret[2]) { + throw takeFromExternrefTable0(ret[1]); + } + return takeFromExternrefTable0(ret[0]); + } +} + +export function __wbg_String_8f0eb39a4a4c2f66(arg0, arg1) { + const ret = String(arg1); + const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len1 = WASM_VECTOR_LEN; + getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true); + getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true); +}; + +export function __wbg_buffer_609cc3eee51ed158(arg0) { + const ret = arg0.buffer; + return ret; +}; + +export function __wbg_call_672a4d21634d4a24() { return handleError(function (arg0, arg1) { + const ret = arg0.call(arg1); + return ret; +}, arguments) }; + +export function __wbg_done_769e5ede4b31c67b(arg0) { + const ret = arg0.done; + return ret; +}; + +export function __wbg_get_67b2ba62fc30de12() { return handleError(function (arg0, arg1) { + const ret = Reflect.get(arg0, arg1); + 
return ret; +}, arguments) }; + +export function __wbg_get_b9b93047fe3cf45b(arg0, arg1) { + const ret = arg0[arg1 >>> 0]; + return ret; +}; + +export function __wbg_getwithrefkey_1dc361bd10053bfe(arg0, arg1) { + const ret = arg0[arg1]; + return ret; +}; + +export function __wbg_instanceof_ArrayBuffer_e14585432e3737fc(arg0) { + let result; + try { + result = arg0 instanceof ArrayBuffer; + } catch (_) { + result = false; + } + const ret = result; + return ret; +}; + +export function __wbg_instanceof_Uint8Array_17156bcf118086a9(arg0) { + let result; + try { + result = arg0 instanceof Uint8Array; + } catch (_) { + result = false; + } + const ret = result; + return ret; +}; + +export function __wbg_isArray_a1eab7e0d067391b(arg0) { + const ret = Array.isArray(arg0); + return ret; +}; + +export function __wbg_isSafeInteger_343e2beeeece1bb0(arg0) { + const ret = Number.isSafeInteger(arg0); + return ret; +}; + +export function __wbg_iterator_9a24c88df860dc65() { + const ret = Symbol.iterator; + return ret; +}; + +export function __wbg_length_a446193dc22c12f8(arg0) { + const ret = arg0.length; + return ret; +}; + +export function __wbg_length_e2d2a49132c1b256(arg0) { + const ret = arg0.length; + return ret; +}; + +export function __wbg_log_31c4454272417045(arg0, arg1) { + console.log(getStringFromWasm0(arg0, arg1)); +}; + +export function __wbg_new_405e22f390576ce2() { + const ret = new Object(); + return ret; +}; + +export function __wbg_new_78feb108b6472713() { + const ret = new Array(); + return ret; +}; + +export function __wbg_new_a12002a7f91c75be(arg0) { + const ret = new Uint8Array(arg0); + return ret; +}; + +export function __wbg_next_25feadfc0913fea9(arg0) { + const ret = arg0.next; + return ret; +}; + +export function __wbg_next_6574e1a8a62d1055() { return handleError(function (arg0) { + const ret = arg0.next(); + return ret; +}, arguments) }; + +export function __wbg_set_37837023f3d740e8(arg0, arg1, arg2) { + arg0[arg1 >>> 0] = arg2; +}; + +export function 
__wbg_set_3f1d0b984ed272ed(arg0, arg1, arg2) { + arg0[arg1] = arg2; +}; + +export function __wbg_set_65595bdd868b3009(arg0, arg1, arg2) { + arg0.set(arg1, arg2 >>> 0); +}; + +export function __wbg_value_cd1ffa7b1ab794f1(arg0) { + const ret = arg0.value; + return ret; +}; + +export function __wbindgen_as_number(arg0) { + const ret = +arg0; + return ret; +}; + +export function __wbindgen_boolean_get(arg0) { + const v = arg0; + const ret = typeof(v) === 'boolean' ? (v ? 1 : 0) : 2; + return ret; +}; + +export function __wbindgen_debug_string(arg0, arg1) { + const ret = debugString(arg1); + const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + const len1 = WASM_VECTOR_LEN; + getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true); + getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true); +}; + +export function __wbindgen_error_new(arg0, arg1) { + const ret = new Error(getStringFromWasm0(arg0, arg1)); + return ret; +}; + +export function __wbindgen_in(arg0, arg1) { + const ret = arg0 in arg1; + return ret; +}; + +export function __wbindgen_init_externref_table() { + const table = wasm.__wbindgen_export_4; + const offset = table.grow(4); + table.set(0, undefined); + table.set(offset + 0, undefined); + table.set(offset + 1, null); + table.set(offset + 2, true); + table.set(offset + 3, false); + ; +}; + +export function __wbindgen_is_function(arg0) { + const ret = typeof(arg0) === 'function'; + return ret; +}; + +export function __wbindgen_is_object(arg0) { + const val = arg0; + const ret = typeof(val) === 'object' && val !== null; + return ret; +}; + +export function __wbindgen_is_undefined(arg0) { + const ret = arg0 === undefined; + return ret; +}; + +export function __wbindgen_jsval_loose_eq(arg0, arg1) { + const ret = arg0 == arg1; + return ret; +}; + +export function __wbindgen_memory() { + const ret = wasm.memory; + return ret; +}; + +export function __wbindgen_number_get(arg0, arg1) { + const obj = arg1; + const ret = typeof(obj) 
=== 'number' ? obj : undefined; + getDataViewMemory0().setFloat64(arg0 + 8 * 1, isLikeNone(ret) ? 0 : ret, true); + getDataViewMemory0().setInt32(arg0 + 4 * 0, !isLikeNone(ret), true); +}; + +export function __wbindgen_number_new(arg0) { + const ret = arg0; + return ret; +}; + +export function __wbindgen_string_get(arg0, arg1) { + const obj = arg1; + const ret = typeof(obj) === 'string' ? obj : undefined; + var ptr1 = isLikeNone(ret) ? 0 : passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc); + var len1 = WASM_VECTOR_LEN; + getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true); + getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true); +}; + +export function __wbindgen_string_new(arg0, arg1) { + const ret = getStringFromWasm0(arg0, arg1); + return ret; +}; + +export function __wbindgen_throw(arg0, arg1) { + throw new Error(getStringFromWasm0(arg0, arg1)); +}; + diff --git a/packages/xetchunk-wasm/vendor/chunker_wasm_bg.wasm b/packages/xetchunk-wasm/vendor/chunker_wasm_bg.wasm new file mode 100644 index 0000000000..bdcdb84fff Binary files /dev/null and b/packages/xetchunk-wasm/vendor/chunker_wasm_bg.wasm differ diff --git a/packages/xetchunk-wasm/vendor/chunker_wasm_bg.wasm.d.ts b/packages/xetchunk-wasm/vendor/chunker_wasm_bg.wasm.d.ts new file mode 100644 index 0000000000..1535d5a6fb --- /dev/null +++ b/packages/xetchunk-wasm/vendor/chunker_wasm_bg.wasm.d.ts @@ -0,0 +1,16 @@ +/* tslint:disable */ +/* eslint-disable */ +export const memory: WebAssembly.Memory; +export const __wbg_chunker_free: (a: number, b: number) => void; +export const chunker_new: (a: number) => number; +export const chunker_add_data: (a: number, b: number, c: number) => [number, number, number]; +export const chunker_finish: (a: number) => [number, number, number]; +export const compute_xorb_hash: (a: any) => [number, number, number, number]; +export const __wbindgen_malloc: (a: number, b: number) => number; +export const __wbindgen_realloc: (a: number, b: number, c: 
number, d: number) => number; +export const __wbindgen_exn_store: (a: number) => void; +export const __externref_table_alloc: () => number; +export const __wbindgen_export_4: WebAssembly.Table; +export const __externref_table_dealloc: (a: number) => void; +export const __wbindgen_free: (a: number, b: number, c: number) => void; +export const __wbindgen_start: () => void; diff --git a/packages/xetchunk-wasm/vendor/package.json b/packages/xetchunk-wasm/vendor/package.json new file mode 100644 index 0000000000..47c13ae667 --- /dev/null +++ b/packages/xetchunk-wasm/vendor/package.json @@ -0,0 +1,17 @@ +{ + "name": "chunker-wasm", + "type": "module", + "version": "0.1.0", + "files": [ + "chunker_wasm_bg.wasm", + "chunker_wasm.js", + "chunker_wasm_bg.js", + "chunker_wasm.d.ts" + ], + "main": "chunker_wasm.js", + "types": "chunker_wasm.d.ts", + "sideEffects": [ + "./chunker_wasm.js", + "./snippets/*" + ] +} \ No newline at end of file diff --git a/packages/xetchunk-wasm/vitest.config.ts b/packages/xetchunk-wasm/vitest.config.ts new file mode 100644 index 0000000000..b6d61dc045 --- /dev/null +++ b/packages/xetchunk-wasm/vitest.config.ts @@ -0,0 +1,13 @@ +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + globals: true, + environment: "node", + }, + resolve: { + alias: { + "@huggingface/splitmix64-wasm": "./node_modules/@huggingface/splitmix64-wasm/build/release.js", + }, + }, +}); diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 08e651bb73..9285ebfb63 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -14,3 +14,7 @@ packages: - "packages/ollama-utils" - "packages/mcp-client" - "packages/tiny-agents" + - "packages/gearhash-wasm" + - "packages/blake3-wasm" + - "packages/xetchunk-wasm" + - "packages/splitmix64-wasm" diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000000..fbe8ff6fda --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "target": "ESNext", + 
"module": "ESNext", + "moduleResolution": "node", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "lib": ["ESNext"], + "types": ["assemblyscript"] + } +}