Skip to content

Commit 47c0ad4

Browse files
authored
XetBlob util (#1257)
Python lib code: https://github.com/huggingface-internal/xetpoc_huggingface_hub/pull/14/files (cc @bpronan @Wauplin @hanouticelina)

# Todo

Add more test cases:
- [x] on a larger amount of data (need to fetch several terms)
- [x] when offset_into_first_range > a chunk length
- [ ] when term.range.start > fetchinfo.range.start => created #1285
- [x] check range request when requesting range at exact chunk boundaries, or just before/just after
1 parent 1af8fd0 commit 47c0ad4

14 files changed

+1457
-8
lines changed

.github/workflows/test.yml

+3-1
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,13 @@ jobs:
6161
echo "SINCE=$(git merge-base origin/${{ github.event.pull_request.base.ref }} ${{ github.sha }})" >> $GITHUB_OUTPUT
6262
fi
6363
64+
- run: google-chrome --version
65+
6466
- run: npm install -g corepack@latest && corepack enable
6567

6668
- uses: actions/setup-node@v3
6769
with:
68-
node-version: "20"
70+
node-version: "22"
6971
cache: "pnpm"
7072
cache-dependency-path: "**/pnpm-lock.yaml"
7173
- run: |

CONTRIBUTING.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@ It's not a hard requirement, but please consider using an icon from [Gitmoji](ht
1818

1919
## Tests
2020

21-
If you want to run only specific tests, you can do `pnpm test -- -t "test name"`
21+
If you want to run only specific tests, you can do `pnpm test -- -t "test name"`.
22+
23+
You can also do `npx vitest ./packages/hub/src/utils/XetBlob.spec.ts` to run a specific test file.
24+
25+
Or `cd packages/hub && npx vitest --browser.name=chrome --browser.headless --config vitest-browser.config.mts ./src/utils/XetBlob.spec.ts` to run browser tests on a specific file.
2226

2327
## Adding a package
2428

packages/hub/.eslintignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
dist
22
sha256.js
3+
src/vendor

packages/hub/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,4 @@ Under the hood, `@huggingface/hub` uses a lazy blob implementation to load the f
174174
## Dependencies
175175

176176
- `@huggingface/tasks` : Typings only
177+
- `@huggingface/lz4` : LZ4 decompression utility

packages/hub/src/index.ts

+1
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ export { HubApiError, InvalidApiResponseFormatError } from "./error";
2222
* Only exported for E2Es convenience
2323
*/
2424
export { sha256 as __internal_sha256 } from "./utils/sha256";
25+
export { XetBlob as __internal_XetBlob } from "./utils/XetBlob";

packages/hub/src/lib/list-files.spec.ts

+3-3
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ describe("listFiles", () => {
6060
type: "file",
6161
},
6262
]);
63-
});
63+
}, 30_000);
6464

6565
it("should fetch the list of files from the repo, including last commit", async () => {
6666
const cursor = listFiles({
@@ -146,7 +146,7 @@ describe("listFiles", () => {
146146
type: "file",
147147
},
148148
]);
149-
});
149+
}, 30_000);
150150

151151
it("should fetch the list of files from the repo, including subfolders", async () => {
152152
const cursor = listFiles({
@@ -165,5 +165,5 @@ describe("listFiles", () => {
165165
}
166166

167167
assert(files.some((file) => file.path === "data/XSUM-EMNLP18-Summary-Data-Original.tar.gz"));
168-
});
168+
}, 30_000);
169169
});

packages/hub/src/utils/WebBlob.spec.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ describe("WebBlob", () => {
5858
expect(webBlob).toBeInstanceOf(WebBlob);
5959
expect(webBlob).toMatchObject({ url });
6060
expect(await webBlob.slice(10, 22).text()).toBe("__metadata__");
61-
});
61+
}, 30_000);
6262

6363
it("should lazy load a Xet file hosted on Hugging Face", async () => {
6464
const stableDiffusionUrl =
@@ -70,7 +70,7 @@ describe("WebBlob", () => {
7070
expect(webBlob).toBeInstanceOf(WebBlob);
7171
expect(webBlob).toMatchObject({ url });
7272
expect(await webBlob.slice(10, 22).text()).toBe("__metadata__");
73-
});
73+
}, 30_000);
7474

7575
it("should create a slice on the file", async () => {
7676
const expectedText = fullText.slice(10, 20);

packages/hub/src/utils/WebBlob.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ export class WebBlob extends Blob {
6060

6161
override slice(start = 0, end = this.size): WebBlob {
6262
if (start < 0 || end < 0) {
63-
new TypeError("Unsupported negative start/end on FileBlob.slice");
63+
new TypeError("Unsupported negative start/end on WebBlob.slice");
6464
}
6565

6666
const slice = new WebBlob(
packages/hub/src/utils/XetBlob.spec.ts

+185
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
import { describe, expect, it } from "vitest";
2+
import { bg4_regoup_bytes, XetBlob } from "./XetBlob";
3+
4+
describe("XetBlob", () => {
5+
it("should lazy load the first 22 bytes", async () => {
6+
const blob = new XetBlob({
7+
repo: {
8+
type: "model",
9+
name: "celinah/xet-experiments",
10+
},
11+
hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b",
12+
size: 5_234_139_343,
13+
});
14+
15+
expect(await blob.slice(10, 22).text()).toBe("__metadata__");
16+
}, 30_000);
17+
18+
it("should load the first chunk correctly", async () => {
19+
let xorbCount = 0;
20+
const blob = new XetBlob({
21+
repo: {
22+
type: "model",
23+
name: "celinah/xet-experiments",
24+
},
25+
hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b",
26+
size: 5_234_139_343,
27+
fetch: async (url, opts) => {
28+
if (typeof url === "string" && url.includes("/xorbs/")) {
29+
xorbCount++;
30+
}
31+
return fetch(url, opts);
32+
},
33+
});
34+
35+
const xetDownload = await blob.slice(0, 29928).arrayBuffer();
36+
const bridgeDownload = await fetch(
37+
"https://huggingface.co/celinah/xet-experiments/resolve/main/model5GB.safetensors",
38+
{
39+
headers: {
40+
Range: "bytes=0-29927",
41+
},
42+
}
43+
).then((res) => res.arrayBuffer());
44+
45+
expect(new Uint8Array(xetDownload)).toEqual(new Uint8Array(bridgeDownload));
46+
expect(xorbCount).toBe(1);
47+
}, 30_000);
48+
49+
it("should load just past the first chunk correctly", async () => {
50+
let xorbCount = 0;
51+
const blob = new XetBlob({
52+
repo: {
53+
type: "model",
54+
name: "celinah/xet-experiments",
55+
},
56+
hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b",
57+
size: 5_234_139_343,
58+
fetch: async (url, opts) => {
59+
if (typeof url === "string" && url.includes("/xorbs/")) {
60+
xorbCount++;
61+
}
62+
return fetch(url, opts);
63+
},
64+
});
65+
66+
const xetDownload = await blob.slice(0, 29929).arrayBuffer();
67+
const bridgeDownload = await fetch(
68+
"https://huggingface.co/celinah/xet-experiments/resolve/main/model5GB.safetensors",
69+
{
70+
headers: {
71+
Range: "bytes=0-29928",
72+
},
73+
}
74+
).then((res) => res.arrayBuffer());
75+
76+
expect(new Uint8Array(xetDownload)).toEqual(new Uint8Array(bridgeDownload));
77+
expect(xetDownload.byteLength).toBe(29929);
78+
expect(xorbCount).toBe(2);
79+
});
80+
81+
// In GitHub Actions, this test doesn't work inside the browser, but it works locally
82+
// inside both Chrome and Chromium browsers
83+
// TODO: figure out why
84+
if (typeof window === "undefined") {
85+
it("should load correctly when loading far into a chunk range", async () => {
86+
const blob = new XetBlob({
87+
repo: {
88+
type: "model",
89+
name: "celinah/xet-experiments",
90+
},
91+
hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b",
92+
size: 5_234_139_343,
93+
});
94+
95+
const xetDownload = await blob.slice(10_000_000, 10_100_000).arrayBuffer();
96+
const bridgeDownload = await fetch(
97+
"https://huggingface.co/celinah/xet-experiments/resolve/main/model5GB.safetensors",
98+
{
99+
headers: {
100+
Range: "bytes=10000000-10099999",
101+
},
102+
}
103+
).then((res) => res.arrayBuffer());
104+
105+
console.log("xet", xetDownload.byteLength, "bridge", bridgeDownload.byteLength);
106+
expect(new Uint8Array(xetDownload)).toEqual(new Uint8Array(bridgeDownload));
107+
}, 30_000);
108+
}
109+
110+
it("should load text correctly when offset_into_range starts in a chunk further than the first", async () => {
111+
const blob = new XetBlob({
112+
repo: {
113+
type: "model",
114+
name: "celinah/xet-experiments",
115+
},
116+
hash: "794efea76d8cb372bbe1385d9e51c3384555f3281e629903ecb6abeff7d54eec",
117+
size: 62_914_580,
118+
});
119+
120+
// Reconstruction info
121+
// {
122+
// "offset_into_first_range": 600000,
123+
// "terms":
124+
// [
125+
// {
126+
// "hash": "be748f77930d5929cabd510a15f2c30f2f460b639804ef79dea46affa04fd8b2",
127+
// "unpacked_length": 655360,
128+
// "range": { "start": 0, "end": 5 },
129+
// },
130+
// {
131+
// "hash": "be748f77930d5929cabd510a15f2c30f2f460b639804ef79dea46affa04fd8b2",
132+
// "unpacked_length": 655360,
133+
// "range": { "start": 0, "end": 5 },
134+
// },
135+
// ],
136+
// "fetch_info":
137+
// {
138+
// "be748f77930d5929cabd510a15f2c30f2f460b639804ef79dea46affa04fd8b2":
139+
// [
140+
// {
141+
// "range": { "start": 0, "end": 5 },
142+
// "url": "...",
143+
// "url_range": { "start": 0, "end": 2839 },
144+
// },
145+
// ],
146+
// },
147+
// }
148+
149+
const text = await blob.slice(600_000, 700_000).text();
150+
const bridgeDownload = await fetch("https://huggingface.co/celinah/xet-experiments/resolve/main/large_text.txt", {
151+
headers: {
152+
Range: "bytes=600000-699999",
153+
},
154+
}).then((res) => res.text());
155+
156+
console.log("xet", text.length, "bridge", bridgeDownload.length);
157+
expect(text.length).toBe(bridgeDownload.length);
158+
}, 30_000);
159+
160+
describe("bg4_regoup_bytes", () => {
161+
it("should regroup bytes when the array is %4 length", () => {
162+
expect(bg4_regoup_bytes(new Uint8Array([1, 5, 2, 6, 3, 7, 4, 8]))).toEqual(
163+
new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8])
164+
);
165+
});
166+
167+
it("should regroup bytes when the array is %4 + 1 length", () => {
168+
expect(bg4_regoup_bytes(new Uint8Array([1, 5, 9, 2, 6, 3, 7, 4, 8]))).toEqual(
169+
new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9])
170+
);
171+
});
172+
173+
it("should regroup bytes when the array is %4 + 2 length", () => {
174+
expect(bg4_regoup_bytes(new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 4, 8]))).toEqual(
175+
new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
176+
);
177+
});
178+
179+
it("should regroup bytes when the array is %4 + 3 length", () => {
180+
expect(bg4_regoup_bytes(new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8]))).toEqual(
181+
new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
182+
);
183+
});
184+
});
185+
});

0 commit comments

Comments
 (0)