Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Efficient Xet download #1295

Merged
merged 39 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b6572be
Add RangeList util
coyotte508 Mar 18, 2025
622763d
(wip) adapt download algorithm to use range list
coyotte508 Mar 18, 2025
44a8ebd
add RangeList tests
coyotte508 Mar 18, 2025
31845f5
Fix test
coyotte508 Mar 18, 2025
9ca4999
Merge branch 'main' into xet-download-optim
coyotte508 Mar 18, 2025
01eadbd
Debug browser tests
coyotte508 Mar 18, 2025
9effb60
more logs
coyotte508 Mar 18, 2025
a99ab19
more logs
coyotte508 Mar 18, 2025
48e1f4f
more logs
coyotte508 Mar 18, 2025
3e5fc9f
more logs
coyotte508 Mar 18, 2025
8388f98
fix browser test?
coyotte508 Mar 18, 2025
e08718c
fixup! fix browser test?
coyotte508 Mar 18, 2025
bc51527
fix when regurgating duplicated data
coyotte508 Mar 18, 2025
f0f88f4
set global timeout
coyotte508 Mar 18, 2025
990cadb
more logs
coyotte508 Mar 18, 2025
888b3b5
Remove logs in prod
coyotte508 Mar 18, 2025
d88fa83
fixup! Remove logs in prod
coyotte508 Mar 18, 2025
d559933
remove logging altogether
coyotte508 Mar 19, 2025
813ec20
add more unit tests
coyotte508 Mar 19, 2025
9122422
other data interval pattern
coyotte508 Mar 19, 2025
f688c11
Merge remote-tracking branch 'origin/main' into xet-download-optim
coyotte508 Mar 19, 2025
86ad611
remove 'only' in tests
coyotte508 Mar 19, 2025
d5d8150
do not run mocked tests on browser
coyotte508 Mar 19, 2025
c2cc1d8
Revert "remove logging altogether"
coyotte508 Mar 19, 2025
73490f8
add back logging to check why browser test fails
coyotte508 Mar 19, 2025
b405425
one more mocked test
coyotte508 Mar 19, 2025
9a02bc0
actually log stuff
coyotte508 Mar 19, 2025
4540349
tweak internal loggin
coyotte508 Mar 20, 2025
8d13c86
temporarily store test data
coyotte508 Mar 20, 2025
e287426
try mocked tests in browser again
coyotte508 Mar 20, 2025
524f284
log timestamps
coyotte508 Mar 20, 2025
716cfde
more logs
coyotte508 Mar 20, 2025
7750cc2
more logs
coyotte508 Mar 20, 2025
4828a0e
remove tmp txt log files
coyotte508 Mar 20, 2025
b09d748
listener in xet blob
coyotte508 Mar 20, 2025
0a2e0e9
log range from remote
coyotte508 Mar 20, 2025
d5729fd
more listener event
coyotte508 Mar 20, 2025
ea0d151
no date logs
coyotte508 Mar 20, 2025
83bdf2f
add a mocked test
coyotte508 Mar 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions packages/hub/src/lib/list-files.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ describe("listFiles", () => {
type: "file",
},
]);
}, 30_000);
});

it("should fetch the list of files from the repo, including last commit", async () => {
const cursor = listFiles({
Expand Down Expand Up @@ -146,7 +146,7 @@ describe("listFiles", () => {
type: "file",
},
]);
}, 30_000);
});

it("should fetch the list of files from the repo, including subfolders", async () => {
const cursor = listFiles({
Expand All @@ -165,5 +165,5 @@ describe("listFiles", () => {
}

assert(files.some((file) => file.path === "data/XSUM-EMNLP18-Summary-Data-Original.tar.gz"));
}, 30_000);
});
});
2 changes: 1 addition & 1 deletion packages/hub/src/lib/parse-safetensors-metadata.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ describe("parseSafetensorsMetadata", () => {
assert.deepStrictEqual(parse.parameterCount, { BF16: 176_247_271_424 });
assert.deepStrictEqual(sum(Object.values(parse.parameterCount)), 176_247_271_424);
// total params = 176B
}, 30_000);
});

it("fetch info for single-file with multiple dtypes", async () => {
const parse = await parseSafetensorsMetadata({
Expand Down
2 changes: 1 addition & 1 deletion packages/hub/src/lib/upload-file.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,5 @@ describe("uploadFile", () => {
hubUrl: TEST_HUB_URL,
});
}
}, 30_000);
});
});
96 changes: 96 additions & 0 deletions packages/hub/src/utils/RangeList.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import { describe, it, expect } from "vitest";
import { RangeList } from "./RangeList";

describe("RangeList", () => {
it("should add a single range", () => {
const rangeList = new RangeList();
rangeList.add(1, 100);

const ranges = rangeList.getAllRanges();
expect(ranges).toHaveLength(1);
expect(ranges[0]).toEqual({
start: 1,
end: 100,
refCount: 1,
data: null,
});
});

it("should handle overlapping ranges", () => {
const rangeList = new RangeList();
rangeList.add(1, 100);
rangeList.add(30, 50);

const ranges = rangeList.getAllRanges();
expect(ranges).toHaveLength(3);
expect(ranges).toEqual([
{ start: 1, end: 30, refCount: 1, data: null },
{ start: 30, end: 50, refCount: 2, data: null },
{ start: 50, end: 100, refCount: 1, data: null },
]);
});

it("should remove a range at existing boundaries", () => {
const rangeList = new RangeList();
rangeList.add(1, 100);
rangeList.add(30, 50);
rangeList.remove(30, 50);

const ranges = rangeList.getAllRanges();
expect(ranges).toHaveLength(3);
expect(ranges).toEqual([
{ start: 1, end: 30, refCount: 1, data: null },
{ start: 30, end: 50, refCount: 1, data: null },
{ start: 50, end: 100, refCount: 1, data: null },
]);
});

it("should throw error when removing range at non-existing boundaries", () => {
const rangeList = new RangeList();
rangeList.add(1, 100);
rangeList.add(30, 50);

expect(() => rangeList.remove(2, 50)).toThrow("Range boundaries must match existing boundaries");
});

it("should get ranges within boundaries", () => {
const rangeList = new RangeList();
rangeList.add(1, 100);
rangeList.add(30, 50);

const ranges = rangeList.getRanges(30, 100);
expect(ranges).toHaveLength(2);
expect(ranges).toEqual([
{ start: 30, end: 50, refCount: 2, data: null },
{ start: 50, end: 100, refCount: 1, data: null },
]);
});

it("should throw error when end is less than or equal to start", () => {
const rangeList = new RangeList();

expect(() => rangeList.add(100, 1)).toThrow("End must be greater than start");
expect(() => rangeList.add(1, 1)).toThrow("End must be greater than start");
expect(() => rangeList.remove(100, 1)).toThrow("End must be greater than start");
expect(() => rangeList.remove(1, 1)).toThrow("End must be greater than start");
expect(() => rangeList.getRanges(100, 1)).toThrow("End must be greater than start");
expect(() => rangeList.getRanges(1, 1)).toThrow("End must be greater than start");
});

it("should handle multiple overlapping ranges", () => {
const rangeList = new RangeList();
rangeList.add(1, 100);
rangeList.add(30, 50);
rangeList.add(40, 60);

const ranges = rangeList.getAllRanges();
expect(ranges).toHaveLength(5);
expect(ranges).toEqual([
{ start: 1, end: 30, refCount: 1, data: null },
{ start: 30, end: 40, refCount: 2, data: null },
{ start: 40, end: 50, refCount: 3, data: null },
{ start: 50, end: 60, refCount: 2, data: null },
{ start: 60, end: 100, refCount: 1, data: null },
]);
});
});
179 changes: 179 additions & 0 deletions packages/hub/src/utils/RangeList.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/**
* Code generated with this prompt by Cursor:
*
* I want to build a class to manage ranges
*
* I can add ranges to it with a start& an end (both integer, end > start). It should store those ranges efficently.
*
* When several ranges overlap, eg [1, 100] and [30, 50], I want the class to split the range into non-overlapping ranges, and add a "ref counter" to the ranges. For example, [1, 30], [30, 50] * 2, [50, 100]
*
* I also want to be able to remove ranges, it will decrease the ref counter or remove the range altogether. I can only remove ranges at existing boundaries. For example, with the [1, 30], [30, 50] * 2, [50, 100] configuration
*
* - removing [1, 100] => the only range remaning is [30, 50]
* - removing [2, 50] => error, because "2' is not a boundary
* - removing [30, 50] => [1, 30], [30, 50], [50, 100] (do not "merge" the ranges back together)
*
* I want to be able to associate data to each range. And I want to be able to get the ranges inside boundaries. For example , with [1, 30], [30, 50] * 2, [50, 100] configuration
*
* - getting [30, 100] => I receive [30, 50] * 2, [50, 100], and I can get / modify the data assocaited to each range by accessing their data prop. Note the "*2" is just the ref counter, there is onlly one range object for the interval returned
* - getting [2, 50] => I get [30, 50] * 2
*
* ----
*
* Could optimize with binary search, but the ranges we want to handle are not that many.
*/
interface Range<T> {
start: number;
end: number;
refCount: number;
data: T | null;
}

export class RangeList<T> {
private ranges: Range<T>[] = [];

/**
* Add a range to the list. If it overlaps with existing ranges,
* it will split them and increment reference counts accordingly.
*/
add(start: number, end: number): void {
if (end <= start) {
throw new TypeError("End must be greater than start");
}

// Find all ranges that overlap with the new range
const overlappingRanges: { index: number; range: Range<T> }[] = [];
for (let i = 0; i < this.ranges.length; i++) {
const range = this.ranges[i];
if (start < range.end && end > range.start) {
overlappingRanges.push({ index: i, range });
}
if (range.data !== null) {
throw new Error("Overlapping range already has data");
}
}

if (overlappingRanges.length === 0) {
// No overlaps, just add the new range
this.ranges.push({ start, end, refCount: 1, data: null });
this.ranges.sort((a, b) => a.start - b.start);
return;
}

// Handle overlaps by splitting ranges
const newRanges: Range<T>[] = [];
let currentPos = start;

for (let i = 0; i < overlappingRanges.length; i++) {
const { range } = overlappingRanges[i];

// Add range before overlap if exists
if (currentPos < range.start) {
newRanges.push({
start: currentPos,
end: range.start,
refCount: 1,
data: null,
});
} else if (range.start < currentPos) {
newRanges.push({
start: range.start,
end: currentPos,
refCount: range.refCount,
data: null,
});
}

// Add overlapping part with increased ref count
newRanges.push({
start: Math.max(currentPos, range.start),
end: Math.min(end, range.end),
refCount: range.refCount + 1,
data: null,
});

// Add remaining part of existing range if exists
if (range.end > end) {
newRanges.push({
start: end,
end: range.end,
refCount: range.refCount,
data: null,
});
}

currentPos = Math.max(currentPos, range.end);
}

// Add remaining part after last overlap if exists
if (currentPos < end) {
newRanges.push({
start: currentPos,
end,
refCount: 1,
data: null,
});
}

// Remove old overlapping ranges and insert new ones
const firstIndex = overlappingRanges[0].index;
const lastIndex = overlappingRanges[overlappingRanges.length - 1].index;
this.ranges.splice(firstIndex, lastIndex - firstIndex + 1, ...newRanges);
this.ranges.sort((a, b) => a.start - b.start);
}

/**
* Remove a range from the list. The range must start and end at existing boundaries.
*/
remove(start: number, end: number): void {
if (end <= start) {
throw new TypeError("End must be greater than start");
}

// Find ranges that need to be modified
const affectedRanges: { index: number; range: Range<T> }[] = [];
for (let i = 0; i < this.ranges.length; i++) {
const range = this.ranges[i];
if (start < range.end && end > range.start) {
affectedRanges.push({ index: i, range });
}
}

if (affectedRanges.length === 0) {
throw new Error("No ranges found to remove");
}

// Verify boundaries match
if (start !== affectedRanges[0].range.start || end !== affectedRanges[affectedRanges.length - 1].range.end) {
throw new Error("Range boundaries must match existing boundaries");
}

// Todo: also check if there's a gap in the middle but it should not happen with our usage

for (let i = 0; i < affectedRanges.length; i++) {
const { range } = affectedRanges[i];

range.refCount--;
}

this.ranges = this.ranges.filter((range) => range.refCount > 0);
}

/**
* Get all ranges within the specified boundaries.
*/
getRanges(start: number, end: number): Range<T>[] {
if (end <= start) {
throw new TypeError("End must be greater than start");
}

return this.ranges.filter((range) => start < range.end && end > range.start);
}

/**
* Get all ranges in the list
*/
getAllRanges(): Range<T>[] {
return [...this.ranges];
}
}
4 changes: 2 additions & 2 deletions packages/hub/src/utils/WebBlob.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ describe("WebBlob", () => {
expect(webBlob).toBeInstanceOf(WebBlob);
expect(webBlob).toMatchObject({ url });
expect(await webBlob.slice(10, 22).text()).toBe("__metadata__");
}, 30_000);
});

it("should lazy load a Xet file hosted on Hugging Face", async () => {
const stableDiffusionUrl =
Expand All @@ -70,7 +70,7 @@ describe("WebBlob", () => {
expect(webBlob).toBeInstanceOf(WebBlob);
expect(webBlob).toMatchObject({ url });
expect(await webBlob.slice(10, 22).text()).toBe("__metadata__");
}, 30_000);
});

it("should create a slice on the file", async () => {
const expectedText = fullText.slice(10, 20);
Expand Down
Loading