import { uploadShards } from "../src/utils/uploadShards.js";
import { sha256 } from "../src/utils/sha256.js";
import { parseArgs } from "node:util";
import { join, basename } from "node:path";
import { writeFile, readFile, stat, mkdir, readdir } from "node:fs/promises";
import type { RepoId } from "../src/types/public.js";
import { toRepoId } from "../src/utils/toRepoId.js";
import { FileBlob } from "../src/utils/FileBlob.js";
import { existsSync } from "node:fs";

/**
 * This script debugs xet uploads by capturing all network data locally.
 * It takes a local file, repo, and token, then uploads the file while saving:
 * - Dedup shards as dedup_[chunk_hash]_shard.bin
 * - Uploaded xorbs as uploaded_xorb_1.bin, uploaded_xorb_2.bin, etc.
 * - Uploaded shards as uploaded_shard_1.bin, uploaded_shard_2.bin, etc.
 *
 * Normal mode: Captures all upload data to an upload_[filename]/ directory.
 * Replay mode: Validates that upload data matches the previously captured local files.
 *
 * Usage:
 * pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo>
 * pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo> --replay
 */

interface DebugFetchStats {
	xorbCount: number;
	shardCount: number;
	dedupShardCount: number;
}

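/**
 * Wraps the global fetch: xorb uploads, shard uploads, and chunk dedup
 * lookups are captured to local files in normal mode, or validated against /
 * served from those files in replay mode. All other requests pass through.
 */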
function createDebugFetch(args: { debugDir: string; replay?: boolean }): {
	fetch: typeof fetch;
	getStats: () => DebugFetchStats;
} {
	let xorbCount = 0;
	let shardCount = 0;
	let dedupShardCount = 0;

	const debugFetch = async function (input: string | URL | Request, init?: RequestInit): Promise<Response> {
		// Request.prototype.toString() yields "[object Request]", so read .url for Request inputs
		const url = typeof input === "string" ? input : input instanceof Request ? input.url : input.toString();

		// Handle xorb uploads - capture the xorb data
		if (url.includes("/xorbs/")) {
			xorbCount++;
			const xorbFilename = `uploaded_xorb_${xorbCount}.bin`;
			const xorbPath = join(args.debugDir, xorbFilename);

			if (init?.body) {
				const uploadData = init.body as Uint8Array;

				if (args.replay) {
					// In replay mode, compare with existing local file
					const localData = await readFile(xorbPath);
					if (localData.length !== uploadData.length || !localData.every((byte, i) => byte === uploadData[i])) {
						console.error(`❌ Xorb data mismatch: ${xorbFilename}`);
						console.error(`   Local size: ${localData.length}, Upload size: ${uploadData.length}`);
						throw new Error(`Xorb validation failed for ${xorbFilename}`);
					}
					console.log(`✅ Xorb validation passed: ${xorbFilename} - xorb file is the same as generated previously`);
					return new Response(null, { status: 200 });
				} else {
					// In normal mode, save the data
					await writeFile(xorbPath, uploadData);
					console.log(`💾 Saved xorb to ${xorbFilename} (${uploadData.length} bytes)`);
				}
			}

			// Forward the real request to backend
			const realResponse = await fetch(input, init);
			console.log(`📤 Xorb upload ${xorbCount}: ${realResponse.status} ${realResponse.statusText}`);
			return realResponse;
		}

		// Handle shard uploads - capture the shard data
		if (url.endsWith("/shards")) {
			shardCount++;
			const shardFilename = `uploaded_shard_${shardCount}.bin`;
			const shardPath = join(args.debugDir, shardFilename);

			if (init?.body) {
				const uploadData = init.body as Uint8Array;

				if (args.replay) {
					// In replay mode, compare with existing local file
					const localData = await readFile(shardPath);
					if (localData.length !== uploadData.length) {
						console.error(`❌ Shard data mismatch: ${shardFilename}`);
						console.error(`   Local size: ${localData.length}, Upload size: ${uploadData.length}`);
						throw new Error(`Shard validation failed for ${shardFilename}`);
					}

					// The final 8 bytes of a shard are a little-endian u64 giving the footer's
					// start offset. Build the DataView with the Buffer's byteOffset/byteLength,
					// since a Node Buffer can be a view into a larger shared ArrayBuffer.
					const view = new DataView(localData.buffer, localData.byteOffset, localData.byteLength);
					const footerStart = Number(view.getBigUint64(localData.byteLength - 8, true));
					// Compare all bytes except the shard creation timestamp: the 8 bytes at
					// footer offsets 104-111, i.e. the half-open range [104, 112)
					const toIgnoreStart = footerStart + 104;
					const toIgnoreEnd = footerStart + 112;

					const mismatch = localData.some((byte, i) => {
						if (i >= toIgnoreStart && i < toIgnoreEnd) {
							return false;
						}
						return byte !== uploadData[i];
					});

					if (mismatch) {
						console.error(`❌ Shard data mismatch: ${shardFilename}`);
						console.error(`   Local size: ${localData.length}, Upload size: ${uploadData.length}`);
						throw new Error(`Shard validation failed for ${shardFilename}`);
					}
					console.log(`✅ Shard validation passed: ${shardFilename} - shard file is the same as generated previously`);

					// Do not mock the shard call
					// return new Response(null, { status: 200 });
				} else {
					// In normal mode, save the data
					await writeFile(shardPath, uploadData);
					console.log(`💾 Saved shard to ${shardFilename} (${uploadData.length} bytes)`);
				}
			}

			// Forward the real request to backend
			const realResponse = await fetch(input, init);
			console.log(`📤 Shard upload ${shardCount}: ${realResponse.status} ${realResponse.statusText}`);
			return realResponse;
		}

		// Handle dedup info requests - save or replay locally
		if (url.includes("/chunks/")) {
			if (args.replay) {
				// In replay mode, try to load from local files
				const urlParts = url.split("/");
				const chunkHash = urlParts[urlParts.length - 1];
				const dedupFilename = `dedup_${chunkHash}_shard.bin`;
				const dedupPath = join(args.debugDir, dedupFilename);

				try {
					const localData = await readFile(dedupPath);
					console.log(`🔄 Replaying dedup info from ${dedupFilename}`);
					return new Response(localData, { status: 200 });
				} catch {
					// No captured dedup info for this chunk - answer 404 as the backend would
					return new Response(null, { status: 404 });
				}
			}

			// Forward to real backend and save response
			const realResponse = await fetch(input, init);

			if (realResponse.ok && realResponse.body) {
				const urlParts = url.split("/");
				const chunkHash = urlParts[urlParts.length - 1];
				const dedupFilename = `dedup_${chunkHash}_shard.bin`;
				const dedupPath = join(args.debugDir, dedupFilename);

				const responseData = await realResponse.arrayBuffer();
				await writeFile(dedupPath, new Uint8Array(responseData));

				dedupShardCount++;
				console.log(`💾 Saved dedup info to ${dedupFilename} (${responseData.byteLength} bytes)`);

				// Return a new response with the same data, since arrayBuffer() consumed the original body
				return new Response(responseData, {
					status: realResponse.status,
					statusText: realResponse.statusText,
					headers: realResponse.headers,
				});
			}

			return realResponse;
		}

		// For all other requests, use real fetch
		return fetch(input, init);
	};

	return {
		fetch: debugFetch,
		getStats: () => ({ xorbCount, shardCount, dedupShardCount }),
	};
}

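/**
 * Yields the single input file as { content, path, sha256 }, the source shape
 * uploadShards consumes. The sha256 iterator reports progress values before
 * resolving to the final hash string, so it is drained before yielding.
 */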
async function* createFileSource(filepath: string): AsyncGenerator<{
	content: Blob;
	path: string;
	sha256: string;
}> {
	const filename = basename(filepath);
	console.log(`Processing ${filename}...`);

	const blob: Blob = await FileBlob.create(filepath);

	// Calculate sha256
	console.log(`Calculating SHA256 for ${filename}...`);
	const sha256Iterator = sha256(blob, { useWebWorker: false });
	let res: IteratorResult<number, string>;
	do {
		res = await sha256Iterator.next();
	} while (!res.done);
	const sha256Hash = res.value;

	console.log(`SHA256 for ${filename}: ${sha256Hash}`);

	yield {
		content: blob,
		path: filename,
		sha256: sha256Hash,
	};
}

async function main() {
	const { values: args } = parseArgs({
		options: {
			token: {
				type: "string",
				short: "t",
			},
			repo: {
				type: "string",
				short: "r",
			},
			file: {
				type: "string",
				short: "f",
			},
			replay: {
				type: "boolean",
				default: false,
			},
		},
	});

	if (!args.token || !args.repo || !args.file) {
		console.error("Usage: pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo>");
		console.error("Example: pnpm --filter hub debug-xet -f ./model.bin -t hf_... -r myuser/myrepo");
		console.error("Options:");
		console.error("  --replay    Use local dedup info instead of remote");
		process.exit(1);
	}

	if (!existsSync(args.file)) {
		console.error(`❌ File ${args.file} does not exist`);
		process.exit(1);
	}

	const filename = basename(args.file);
	const debugDir = `upload_${filename}`;

	// Handle debug directory based on mode
	if (args.replay) {
		// In replay mode, directory must exist
		if (!existsSync(debugDir)) {
			console.error(`❌ Debug directory ${debugDir} does not exist`);
			console.error(`   Run without --replay first to capture upload data`);
			process.exit(1);
		}
		console.log(`📁 Using existing debug directory: ${debugDir}`);
	} else {
		// In normal mode, directory must not exist
		if (existsSync(debugDir)) {
			console.error(`❌ Debug directory ${debugDir} already exists`);
			console.error(`   Please remove it first: rm -rf ${debugDir}`);
			process.exit(1);
		}

		// Create debug directory
		await mkdir(debugDir, { recursive: true });
		console.log(`📁 Created debug directory: ${debugDir}`);
	}

	// Parse repo
	const repo: RepoId = toRepoId(args.repo);

	// Create debug fetch
	const debugFetchObj = createDebugFetch({
		debugDir,
		replay: args.replay,
	});

	// Setup upload parameters
	const uploadParams = {
		accessToken: args.token,
		hubUrl: "https://huggingface.co",
		fetch: debugFetchObj.fetch,
		repo,
		rev: "main",
	};

	console.log(`\n=== Starting debug upload for ${filename} ===`);
	if (args.replay) {
		console.log("🔄 Replay mode: Using local dedup info when available");
	}

	// Get file stats
	const fileStats = await stat(args.file);
	console.log(`📄 File size: ${(fileStats.size / 1024 / 1024).toFixed(2)} MB`);

	// Process file through uploadShards
	const fileSource = createFileSource(args.file);

	let dedupRatio = 0;
	let fileSha256 = "";

	for await (const event of uploadShards(fileSource, uploadParams)) {
		switch (event.event) {
			case "file": {
				console.log(`\n✅ Upload completed for: ${event.path}`);
				console.log(`   SHA256: ${event.sha256}`);
				console.log(`   Dedup ratio: ${(event.dedupRatio * 100).toFixed(2)}%`);

				dedupRatio = event.dedupRatio;
				fileSha256 = event.sha256;
				break;
			}

			case "fileProgress": {
				// Progress already logged in yieldCallback
				break;
			}
		}
	}

	// Get final stats from debug fetch
	const stats = debugFetchObj.getStats();

	console.log("\n=== DEBUG UPLOAD RESULTS ===");
	console.log(`📁 Debug directory: ${debugDir}`);
	console.log(`📄 Original file: ${filename} (${(fileStats.size / 1024 / 1024).toFixed(2)} MB)`);
	console.log(`🔒 SHA256: ${fileSha256}`);
	console.log(`📊 Deduplication: ${(dedupRatio * 100).toFixed(2)}%`);
	console.log(`📤 Network calls:`);
	console.log(`   - ${stats.xorbCount} xorb uploads`);
	console.log(`   - ${stats.shardCount} shard uploads`);
	console.log(`   - ${stats.dedupShardCount} dedup info downloads`);

	// List all captured files
	const capturedFiles = await readdir(debugDir);
	console.log(`\n💾 Captured ${capturedFiles.length} files:`);
	for (const file of capturedFiles.sort()) {
		const filePath = join(debugDir, file);
		const fileInfo = await stat(filePath);
		console.log(`   - ${file} (${fileInfo.size.toLocaleString()} bytes)`);
	}

	if (args.replay) {
		console.log(`\n✅ Replay validation completed successfully!`);
		console.log(`   All uploaded data matched local files`);
	} else {
		console.log(`\n🚀 Debug upload completed successfully!`);
		console.log(`   Use --replay flag to test with local dedup data`);
	}
}

main().catch((error) => {
	console.error("❌ Error:", error);
	process.exit(1);
});