Skip to content

Commit 881c44b

Browse files
authored
Debug mode for xet upload (#1767)
cc @mishig25 @assafvayner. This script saves all downloaded dedup shards, as well as the xorbs and shards output when uploading a file. It also has a "replay" mode to reuse those saved files. This should help debug xet uploads.
1 parent 219b254 commit 881c44b

File tree

4 files changed

+365
-5
lines changed

4 files changed

+365
-5
lines changed

packages/hub/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
"test:browser": "vitest run --browser.name=chrome --browser.headless --config vitest-browser.config.mts",
4444
"check": "tsc",
4545
"build:xet-wasm": "./scripts/build-xet-wasm.sh -t bundler --clean",
46-
"bench": "tsx scripts/bench.ts"
46+
"bench": "tsx scripts/bench.ts",
47+
"debug-xet": "tsx scripts/debug-xet.ts"
4748
},
4849
"files": [
4950
"src",

packages/hub/scripts/debug-xet.ts

Lines changed: 359 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,359 @@
1+
import { uploadShards } from "../src/utils/uploadShards.js";
2+
import { sha256 } from "../src/utils/sha256.js";
3+
import { parseArgs } from "node:util";
4+
import { join, basename } from "node:path";
5+
import { writeFile, readFile, stat, mkdir, readdir } from "node:fs/promises";
6+
import type { RepoId } from "../src/types/public.js";
7+
import { toRepoId } from "../src/utils/toRepoId.js";
8+
import { FileBlob } from "../src/utils/FileBlob.js";
9+
import { existsSync } from "node:fs";
10+
11+
/**
12+
* This script debugs xet uploads by capturing all network data locally
13+
* It takes a local file, repo, and token, then uploads while saving:
14+
* - Dedup shards as dedup_[chunk_hash]_shard.bin
15+
* - Uploaded xorbs as uploaded_xorb_1.bin, uploaded_xorb_2.bin, etc.
16+
* - Uploaded shards as uploaded_shard_1.bin, uploaded_shard_2.bin, etc.
17+
*
18+
* Normal mode: Captures all upload data to upload_[filename]/ directory
19+
* Replay mode: Validates upload data matches previously captured local files
20+
*
21+
* Usage:
22+
* pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo>
23+
* pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo> --replay
24+
*/
25+
26+
/** Counters describing the traffic observed by the debug fetch wrapper. */
interface DebugFetchStats {
	/** Number of requests whose URL contained "/xorbs/". */
	xorbCount: number;
	/** Number of requests whose URL ended with "/shards". */
	shardCount: number;
	/** Number of "/chunks/" responses fetched from the backend and saved locally. */
	dedupShardCount: number;
}

/**
 * Builds a `fetch` replacement that records (or, in replay mode, validates) xet upload traffic.
 *
 * Requests are classified by URL:
 * - contains "/xorbs/": the body is saved as `uploaded_xorb_<n>.bin`; in replay mode it is compared
 *   byte-for-byte with that file and a mocked 200 response is returned without any network call.
 * - ends with "/shards": the body is saved as `uploaded_shard_<n>.bin`; in replay mode it is compared
 *   with that file (ignoring the 8 timestamp bytes of the footer) and the request is still forwarded.
 * - contains "/chunks/": the successful response is saved as `dedup_<last URL segment>_shard.bin`;
 *   in replay mode the saved file is served instead, or a 404 when it cannot be read.
 * - anything else is forwarded to the global `fetch` untouched.
 *
 * @param args.debugDir Directory in which captured files are written and read. It must already exist.
 * @param args.replay When true, validate against / serve from previously captured files.
 * @returns The wrapped `fetch` and a `getStats` accessor for the request counters.
 * @throws From the returned `fetch`, in replay mode, when an uploaded xorb or shard differs from the
 *   captured file, or when the captured xorb/shard file cannot be read.
 */
function createDebugFetch(args: { debugDir: string; replay?: boolean }): {
	fetch: typeof fetch;
	getStats: () => DebugFetchStats;
} {
	let xorbCount = 0;
	let shardCount = 0;
	let dedupShardCount = 0;

	/** Maps a "/chunks/" URL to the local file that stores its dedup response. */
	const dedupFileFor = (url: string): { filename: string; path: string } => {
		const urlParts = url.split("/");
		const chunkHash = urlParts[urlParts.length - 1];
		const filename = `dedup_${chunkHash}_shard.bin`;
		return { filename, path: join(args.debugDir, filename) };
	};

	const debugFetch = async function (input: string | URL | Request, init?: RequestInit): Promise<Response> {
		// A Request stringifies to "[object Request]", so its URL must be read from `.url`;
		// URL objects stringify to their href.
		const url = typeof input === "string" ? input : "url" in input ? input.url : input.toString();

		// Handle xorb uploads - capture the xorb data
		if (url.includes("/xorbs/")) {
			xorbCount++;
			const xorbFilename = `uploaded_xorb_${xorbCount}.bin`;
			const xorbPath = join(args.debugDir, xorbFilename);

			if (init?.body) {
				// NOTE(review): assumes the uploader always passes the xorb as a Uint8Array body.
				const uploadData = init.body as Uint8Array;

				if (args.replay) {
					// In replay mode, compare with existing local file
					const localData = await readFile(xorbPath);
					if (localData.length !== uploadData.length || !localData.every((byte, i) => byte === uploadData[i])) {
						console.error(`❌ Xorb data mismatch: ${xorbFilename}`);
						console.error(` Local size: ${localData.length}, Upload size: ${uploadData.length}`);
						throw new Error(`Xorb validation failed for ${xorbFilename}`);
					}
					console.log(`✅ Xorb validation passed: ${xorbFilename} - xorb file is the same as generated previously`);
					// Xorb uploads are mocked in replay mode: nothing is sent to the backend.
					return new Response(null, { status: 200 });
				} else {
					// In normal mode, save the data
					await writeFile(xorbPath, uploadData);
					console.log(`💾 Saved xorb to ${xorbFilename} (${uploadData.length} bytes)`);
				}
			}

			// Forward the real request to backend
			const realResponse = await fetch(input, init);
			console.log(`📤 Xorb upload ${xorbCount}: ${realResponse.status} ${realResponse.statusText}`);
			return realResponse;
		}

		// Handle shard uploads - capture the shard data
		if (url.endsWith("/shards")) {
			shardCount++;
			const shardFilename = `uploaded_shard_${shardCount}.bin`;
			const shardPath = join(args.debugDir, shardFilename);

			if (init?.body) {
				// NOTE(review): assumes the uploader always passes the shard as a Uint8Array body.
				const uploadData = init.body as Uint8Array;

				if (args.replay) {
					// In replay mode, compare with existing local file
					const localData = await readFile(shardPath);
					if (localData.length !== uploadData.length) {
						console.error(`❌ Shard data mismatch: ${shardFilename}`);
						console.error(` Local size: ${localData.length}, Upload size: ${uploadData.length}`);
						throw new Error(`Shard validation failed for ${shardFilename}`);
					}

					// The last 8 bytes of the shard are read as the little-endian offset of the footer.
					// A Buffer may be a view into a larger ArrayBuffer, so the DataView is scoped to the
					// Buffer's own byteOffset/byteLength instead of the whole backing buffer.
					const footerStart = Number(
						new DataView(localData.buffer, localData.byteOffset, localData.byteLength).getBigUint64(
							localData.byteLength - 8,
							true
						)
					);
					// Compare all bytes except footer bytes 104..111 (8 bytes): this is the shard timestamp,
					// which differs between runs.
					const toIgnoreStart = footerStart + 104;
					const toIgnoreEnd = footerStart + 112;

					const mismatch = localData.some((byte, i) => {
						if (i >= toIgnoreStart && i < toIgnoreEnd) {
							return false;
						}
						return byte !== uploadData[i];
					});

					if (mismatch) {
						console.error(`❌ Shard data mismatch: ${shardFilename}`);
						console.error(` Local size: ${localData.length}, Upload size: ${uploadData.length}`);
						throw new Error(`Shard validation failed for ${shardFilename}`);
					}
					console.log(`✅ Shard validation passed: ${shardFilename} - shard file is the same as generated previously`);

					// Do not mock the shard call: fall through and forward it to the backend.
				} else {
					// In normal mode, save the data
					await writeFile(shardPath, uploadData);
					console.log(`💾 Saved shard to ${shardFilename} (${uploadData.length} bytes)`);
				}
			}

			// Forward the real request to backend
			const realResponse = await fetch(input, init);
			console.log(`📤 Shard upload ${shardCount}: ${realResponse.status} ${realResponse.statusText}`);
			return realResponse;
		}

		// Handle dedup info requests - save or replay locally
		if (url.includes("/chunks/")) {
			const dedupFile = dedupFileFor(url);

			if (args.replay) {
				// In replay mode, serve the captured response; a file that cannot be read is answered
				// with a 404 (only successful responses are captured in normal mode).
				try {
					const localData = await readFile(dedupFile.path);
					console.log(`🔄 Replaying dedup info from ${dedupFile.filename}`);
					return new Response(localData, { status: 200 });
				} catch {
					return new Response(null, { status: 404 });
				}
			}

			// Forward to real backend and save response
			const realResponse = await fetch(input, init);

			if (realResponse.ok && realResponse.body) {
				const responseData = await realResponse.arrayBuffer();
				await writeFile(dedupFile.path, new Uint8Array(responseData));

				dedupShardCount++;
				console.log(`💾 Saved dedup info to ${dedupFile.filename} (${responseData.byteLength} bytes)`);

				// The body has been consumed above, so hand the caller a fresh response with the same data
				return new Response(responseData, {
					status: realResponse.status,
					statusText: realResponse.statusText,
					headers: realResponse.headers,
				});
			}

			return realResponse;
		}

		// For all other requests, use real fetch
		return fetch(input, init);
	};

	return {
		fetch: debugFetch,
		getStats: () => ({ xorbCount, shardCount, dedupShardCount }),
	};
}
184+
185+
/**
 * Produces the single upload source for a local file.
 *
 * Opens `filepath` as a blob via `FileBlob.create`, runs the `sha256` iterator over it to completion
 * (without a web worker) and yields one entry carrying the blob, the file's base name and the
 * resulting hash.
 *
 * @param filepath Path of the local file to upload.
 */
async function* createFileSource(filepath: string): AsyncGenerator<{
	content: Blob;
	path: string;
	sha256: string;
}> {
	const name = basename(filepath);
	console.log(`Processing ${name}...`);

	const content: Blob = await FileBlob.create(filepath);

	console.log(`Calculating SHA256 for ${name}...`);
	const hasher = sha256(content, { useWebWorker: false });

	// Step through the iterator until it reports completion; the final result carries the hash.
	let step: IteratorResult<number, string> = await hasher.next();
	while (!step.done) {
		step = await hasher.next();
	}
	const digest = step.value;

	console.log(`SHA256 for ${name}: ${digest}`);

	yield { content, path: name, sha256: digest };
}
212+
213+
/**
 * Command-line entry point.
 *
 * Parses `--token/-t`, `--repo/-r`, `--file/-f` and `--replay`, prepares the `upload_<filename>`
 * debug directory (it must not exist in normal mode and must exist in replay mode), runs the file
 * through `uploadShards` with the debug fetch wrapper, then prints a summary of the upload and of
 * the files found in the debug directory. Exits the process with code 1 on invalid usage.
 */
async function main() {
	const { values: args } = parseArgs({
		options: {
			token: { type: "string", short: "t" },
			repo: { type: "string", short: "r" },
			file: { type: "string", short: "f" },
			replay: { type: "boolean", default: false },
		},
	});
	const { token, repo: repoName, file: filePath, replay } = args;

	// All three of token, repo and file are mandatory.
	if (!token || !repoName || !filePath) {
		console.error("Usage: pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo>");
		console.error("Example: pnpm --filter hub debug-xet -f ./model.bin -t hf_... -r myuser/myrepo");
		console.error("Options:");
		console.error(" --replay Use local dedup info instead of remote");
		process.exit(1);
	}

	if (!existsSync(filePath)) {
		console.error(`❌ File ${filePath} does not exist`);
		process.exit(1);
	}

	const filename = basename(filePath);
	const debugDir = `upload_${filename}`;
	const debugDirExists = existsSync(debugDir);

	if (replay) {
		// Replay needs the data captured by a previous normal run.
		if (!debugDirExists) {
			console.error(`❌ Debug directory ${debugDir} does not exist`);
			console.error(` Run without --replay first to capture upload data`);
			process.exit(1);
		}
		console.log(`📁 Using existing debug directory: ${debugDir}`);
	} else {
		// A normal run refuses to overwrite an earlier capture.
		if (debugDirExists) {
			console.error(`❌ Debug directory ${debugDir} already exists`);
			console.error(` Please remove it first: rm -rf ${debugDir}`);
			process.exit(1);
		}
		await mkdir(debugDir, { recursive: true });
		console.log(`📁 Created debug directory: ${debugDir}`);
	}

	const repo: RepoId = toRepoId(repoName);
	const debugFetchObj = createDebugFetch({ debugDir, replay });

	const uploadParams = {
		accessToken: token,
		hubUrl: "https://huggingface.co",
		fetch: debugFetchObj.fetch,
		repo,
		rev: "main",
	};

	console.log(`\n=== Starting debug upload for ${filename} ===`);
	if (replay) {
		console.log("🔄 Replay mode: Using local dedup info when available");
	}

	const fileStats = await stat(filePath);
	const sizeInMb = (fileStats.size / 1024 / 1024).toFixed(2);
	console.log(`📄 File size: ${sizeInMb} MB`);

	let dedupRatio = 0;
	let fileSha256 = "";

	// Only "file" events are reported here; "fileProgress" events are ignored.
	for await (const event of uploadShards(createFileSource(filePath), uploadParams)) {
		if (event.event === "file") {
			console.log(`\n✅ Upload completed for: ${event.path}`);
			console.log(` SHA256: ${event.sha256}`);
			console.log(` Dedup ratio: ${(event.dedupRatio * 100).toFixed(2)}%`);

			dedupRatio = event.dedupRatio;
			fileSha256 = event.sha256;
		}
	}

	const { xorbCount, shardCount, dedupShardCount } = debugFetchObj.getStats();

	console.log("\n=== DEBUG UPLOAD RESULTS ===");
	console.log(`📁 Debug directory: ${debugDir}`);
	console.log(`📄 Original file: ${filename} (${sizeInMb} MB)`);
	console.log(`🔒 SHA256: ${fileSha256}`);
	console.log(`📊 Deduplication: ${(dedupRatio * 100).toFixed(2)}%`);
	console.log(`📤 Network calls:`);
	console.log(` - ${xorbCount} xorb uploads`);
	console.log(` - ${shardCount} shard uploads`);
	console.log(` - ${dedupShardCount} dedup info downloads`);

	// List everything present in the debug directory, in sorted order.
	const capturedFiles = await readdir(debugDir);
	console.log(`\n💾 Captured ${capturedFiles.length} files:`);
	capturedFiles.sort();
	for (const entry of capturedFiles) {
		const { size } = await stat(join(debugDir, entry));
		console.log(` - ${entry} (${size.toLocaleString()} bytes)`);
	}

	if (replay) {
		console.log(`\n✅ Replay validation completed successfully!`);
		console.log(` All uploaded data matched local files`);
		return;
	}
	console.log(`\n🚀 Debug upload completed successfully!`);
	console.log(` Use --replay flag to test with local dedup data`);
}
355+
356+
// Run the script; any rejection from main() is reported and turned into exit code 1.
main().catch((failure) => {
	console.error("❌ Error:", failure);
	process.exit(1);
});

packages/hub/src/utils/createXorbs.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ class CurrentXorbInfo {
8585
export async function* createXorbs(
8686
fileSources: AsyncGenerator<{ content: Blob; path: string; sha256: string }>,
8787
params: XetWriteTokenParams & {
88-
yieldCallback: (event: { event: "fileProgress"; path: string; progress: number }) => void;
88+
yieldCallback?: (event: { event: "fileProgress"; path: string; progress: number }) => void;
8989
}
9090
): AsyncGenerator<
9191
| XorbEvent
@@ -286,7 +286,7 @@ export async function* createXorbs(
286286
if (bytesSinceLastProgressEvent >= 1_000_000) {
287287
// Emit half of the progress when processed locally, other half when uploading the xorb
288288
bytesSinceLastProgressEvent = 0;
289-
params.yieldCallback({
289+
params.yieldCallback?.({
290290
event: "fileProgress",
291291
path: fileSource.path,
292292
progress:

0 commit comments

Comments
 (0)