Skip to content

Commit ae307fd

Browse files
committed
Replace parquet.js with hyparquet
1 parent 89c726d commit ae307fd

File tree

3 files changed

+14
-110
lines changed

3 files changed

+14
-110
lines changed

package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,14 @@
2626
"dependencies": {
2727
"@types/node": "^18.11.18",
2828
"@types/node-fetch": "^2.6.4",
29-
"@types/parquetjs": "^0.10.6",
3029
"@types/progress-stream": "^2.0.5",
3130
"abort-controller": "^3.0.0",
3231
"agentkeepalive": "^4.2.1",
3332
"axios": "^1.7.7",
3433
"form-data-encoder": "1.7.2",
3534
"formdata-node": "^4.3.2",
35+
"hyparquet": "^1.6.3",
3636
"node-fetch": "^2.6.7",
37-
"parquetjs": "^0.11.2",
3837
"progress-stream": "^2.0.0"
3938
},
4039
"devDependencies": {

src/lib/upload.ts

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ import fetch from 'node-fetch';
77
import * as path from 'path';
88
import progress from 'progress-stream';
99
import readline from 'readline';
10-
import pkg from 'parquetjs';
11-
const { ParquetReader } = pkg;
10+
import { asyncBufferFromFile, parquetMetadataAsync, parquetSchema, SchemaTree } from 'hyparquet';
1211

1312
export interface FileResponse {
1413
id: string;
@@ -76,29 +75,27 @@ export async function check_file(fileName: string): Promise<CheckFileResponse> {
7675

7776
export async function check_parquet(fileName: string): Promise<string | undefined> {
7877
try {
79-
const reader = await ParquetReader.openFile(fileName);
80-
const cursor = reader.getCursor();
81-
let record = null;
78+
const asyncBuffer = await asyncBufferFromFile(fileName);
79+
const metadata = await parquetMetadataAsync(asyncBuffer);
80+
const { children } = parquetSchema(metadata);
8281

83-
const fieldNames = Object.keys(reader.schema.fields);
84-
if (!('input_ids' in fieldNames)) {
82+
const fieldNames = children.map((child: SchemaTree) => child.element.name);
83+
if (!fieldNames.includes('input_ids')) {
8584
return `Parquet file ${fileName} does not contain the 'input_ids' column.`;
8685
}
8786

88-
for (const fieldName in fieldNames) {
87+
for (const fieldName of fieldNames) {
8988
if (!PARQUET_EXPECTED_COLUMNS.includes(fieldName)) {
9089
return `Parquet file ${fileName} contains unexpected column ${fieldName}. Only ${PARQUET_EXPECTED_COLUMNS.join(
9190
', ',
9291
)} are supported`;
9392
}
9493
}
9594

96-
const numRows = reader.getRowCount() as unknown as number;
95+
const numRows = metadata.num_rows;
9796
if (numRows < MIN_SAMPLES) {
9897
return `Parquet file ${fileName} contains only ${numRows} samples. Minimum of ${MIN_SAMPLES} samples are required`;
9998
}
100-
101-
await reader.close();
10299
} catch (err) {
103100
return `failed to read parquet file ${fileName}`;
104101
}

yarn.lock

Lines changed: 5 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -859,13 +859,6 @@
859859
"@types/node" "*"
860860
form-data "^4.0.0"
861861

862-
"@types/node-int64@*":
863-
version "0.4.32"
864-
resolved "https://registry.yarnpkg.com/@types/node-int64/-/node-int64-0.4.32.tgz#a540bcb9e48816ca1b5329d1ab907d6ad134b856"
865-
integrity sha512-xf/JsSlnXQ+mzvc0IpXemcrO4BrCfpgNpMco+GLcXkFk01k/gW9lGJu+Vof0ZSvHK6DsHJDPSbjFPs36QkWXqw==
866-
dependencies:
867-
"@types/node" "*"
868-
869862
"@types/node@*":
870863
version "20.10.5"
871864
resolved "https://registry.yarnpkg.com/@types/node/-/node-20.10.5.tgz#47ad460b514096b7ed63a1dae26fad0914ed3ab2"
@@ -878,13 +871,6 @@
878871
resolved "https://registry.yarnpkg.com/@types/node/-/node-18.11.18.tgz#8dfb97f0da23c2293e554c5a50d61ef134d7697f"
879872
integrity sha512-DHQpWGjyQKSHj3ebjFI/wRKcqQcdR+MoFBygntYOZytCqNfkd2ZC4ARDJ2DQqhjH5p85Nnd3jhUJIXrszFX/JA==
880873

881-
"@types/parquetjs@^0.10.6":
882-
version "0.10.6"
883-
resolved "https://registry.yarnpkg.com/@types/parquetjs/-/parquetjs-0.10.6.tgz#7e4b54d9d336a8dda9c7a9091ec7f60db98744af"
884-
integrity sha512-ZCsD6j97YD0mGU8/VnVs3NjORXa7zeHvqlpJpCqy4jU8a1O21dalL+MFn9QNbdEfy8rszR1N7NHeT7/LdtHf+A==
885-
dependencies:
886-
"@types/node-int64" "*"
887-
888874
"@types/progress-stream@^2.0.5":
889875
version "2.0.5"
890876
resolved "https://registry.yarnpkg.com/@types/progress-stream/-/progress-stream-2.0.5.tgz#50f10be88b0717c8fce6573e7fcafa8eabbc3dcf"
@@ -1194,21 +1180,11 @@ balanced-match@^1.0.0:
11941180
resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee"
11951181
integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==
11961182

1197-
base64-js@^1.1.2:
1198-
version "1.5.1"
1199-
resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.5.1.tgz#1b1b440160a5bf7ad40b650f095963481903930a"
1200-
integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==
1201-
12021183
big-integer@^1.6.44:
12031184
version "1.6.52"
12041185
resolved "https://registry.yarnpkg.com/big-integer/-/big-integer-1.6.52.tgz#60a887f3047614a8e1bffe5d7173490a97dc8c85"
12051186
integrity sha512-QxD8cf2eVqJOOz63z6JIN9BzvVs/dlySa5HGSBH5xtR8dPteIRQnBxxKqkNTiT6jbDTF6jAfrd4oMcND9RGbQg==
12061187

1207-
bindings@~1.2.1:
1208-
version "1.2.1"
1209-
resolved "https://registry.yarnpkg.com/bindings/-/bindings-1.2.1.tgz#14ad6113812d2d37d72e67b4cacb4bb726505f11"
1210-
integrity sha512-u4cBQNepWxYA55FunZSM7wMi55yQaN0otnhhilNoWHq0MfOfJeQx0v0mRRpolGOExPjZcl6FtB0BB8Xkb88F0g==
1211-
12121188
bplist-parser@^0.2.0:
12131189
version "0.2.0"
12141190
resolved "https://registry.yarnpkg.com/bplist-parser/-/bplist-parser-0.2.0.tgz#43a9d183e5bf9d545200ceac3e712f79ebbe8d0e"
@@ -1238,13 +1214,6 @@ braces@^3.0.3:
12381214
dependencies:
12391215
fill-range "^7.1.1"
12401216

1241-
brotli@^1.3.0:
1242-
version "1.3.3"
1243-
resolved "https://registry.yarnpkg.com/brotli/-/brotli-1.3.3.tgz#7365d8cc00f12cf765d2b2c898716bcf4b604d48"
1244-
integrity sha512-oTKjJdShmDuGW94SyyaoQvAjf30dZaHnjJ8uAF+u2/vGJkJbJPJAT1gDiOJP5v1Zb6f9KEyW/1HpuaWIXtGHPg==
1245-
dependencies:
1246-
base64-js "^1.1.2"
1247-
12481217
browserslist@^4.22.2:
12491218
version "4.22.2"
12501219
resolved "https://registry.yarnpkg.com/browserslist/-/browserslist-4.22.2.tgz#704c4943072bd81ea18997f3bd2180e89c77874b"
@@ -1269,11 +1238,6 @@ bser@2.1.1:
12691238
dependencies:
12701239
node-int64 "^0.4.0"
12711240

1272-
bson@^1.0.4:
1273-
version "1.1.6"
1274-
resolved "https://registry.yarnpkg.com/bson/-/bson-1.1.6.tgz#fb819be9a60cd677e0853aee4ca712a785d6618a"
1275-
integrity sha512-EvVNVeGo4tHxwi8L6bPj3y3itEvStdwvvlojVxxbyYfoaxJ6keLgrTuKdyfEAszFK+H3olzBuafE0yoh0D1gdg==
1276-
12771241
buffer-from@^1.0.0:
12781242
version "1.1.2"
12791243
resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.2.tgz#2b146a6fd72e80b4f55d255f35ed59a3a9a41bd5"
@@ -1998,6 +1962,11 @@ humanize-ms@^1.2.1:
19981962
dependencies:
19991963
ms "^2.0.0"
20001964

1965+
hyparquet@^1.6.3:
1966+
version "1.6.3"
1967+
resolved "https://registry.yarnpkg.com/hyparquet/-/hyparquet-1.6.3.tgz#8ddc84c0023cc00bc5ef33246df48ba23b36c38d"
1968+
integrity sha512-JwD3bcRueKs7/0iQG8xyJxh6OAXSQdzh0dJkMtzQPSDOTuvoKniDgdOPB9oXGspbUhp5c9uW6coqYiFddKJ+cw==
1969+
20011970
iconv-lite@^0.6.3:
20021971
version "0.6.3"
20031972
resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501"
@@ -2049,11 +2018,6 @@ inherits@2, inherits@^2.0.3, inherits@~2.0.3:
20492018
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
20502019
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
20512020

2052-
int53@^0.2.4:
2053-
version "0.2.4"
2054-
resolved "https://registry.yarnpkg.com/int53/-/int53-0.2.4.tgz#5ed8d7aad6c5c6567cae69aa7ffc4a109ee80f86"
2055-
integrity sha512-a5jlKftS7HUOhkUyYD7j2sJ/ZnvWiNlZS1ldR+g1ifQ+/UuZXIE+YTc/lK1qGj/GwAU5F8Z0e1eVq2t1J5Ob2g==
2056-
20572021
is-arrayish@^0.2.1:
20582022
version "0.2.1"
20592023
resolved "https://registry.yarnpkg.com/is-arrayish/-/is-arrayish-0.2.1.tgz#77c99840527aa8ecb1a8ba697b80645a7a926a9d"
@@ -2664,13 +2628,6 @@ lru-cache@^6.0.0:
26642628
dependencies:
26652629
yallist "^4.0.0"
26662630

2667-
lzo@^0.4.0:
2668-
version "0.4.11"
2669-
resolved "https://registry.yarnpkg.com/lzo/-/lzo-0.4.11.tgz#0e76d582567b29e285cb84a6aa392cb94c6283f8"
2670-
integrity sha512-apQHNoW2Alg72FMqaC/7pn03I7umdgSVFt2KRkCXXils4Z9u3QBh1uOtl2O5WmZIDLd9g6Lu4lIdOLmiSTFVCQ==
2671-
dependencies:
2672-
bindings "~1.2.1"
2673-
26742631
make-dir@^4.0.0:
26752632
version "4.0.0"
26762633
resolved "https://registry.yarnpkg.com/make-dir/-/make-dir-4.0.0.tgz#c3c2307a771277cd9638305f915c29ae741b614e"
@@ -2805,11 +2762,6 @@ npm-run-path@^5.1.0:
28052762
dependencies:
28062763
path-key "^4.0.0"
28072764

2808-
object-stream@0.0.1:
2809-
version "0.0.1"
2810-
resolved "https://registry.yarnpkg.com/object-stream/-/object-stream-0.0.1.tgz#3a03a26e94fd112c9abffeb4651e07a5e23cf840"
2811-
integrity sha512-+NPJnRvX9RDMRY9mOWOo/NDppBjbZhXirNNSu2IBnuNboClC9h1ZGHXgHBLDbJMHsxeJDq922aVmG5xs24a/cA==
2812-
28132765
once@^1.3.0:
28142766
version "1.4.0"
28152767
resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
@@ -2907,21 +2859,6 @@ parent-module@^1.0.0:
29072859
dependencies:
29082860
callsites "^3.0.0"
29092861

2910-
parquetjs@^0.11.2:
2911-
version "0.11.2"
2912-
resolved "https://registry.yarnpkg.com/parquetjs/-/parquetjs-0.11.2.tgz#ea13221b3583cb1277f8b4b879776420f8863660"
2913-
integrity sha512-Y6FOc3Oi2AxY4TzJPz7fhICCR8tQNL3p+2xGQoUAMbmlJBR7+JJmMrwuyMjIpDiM7G8Wj/8oqOH4UDUmu4I5ZA==
2914-
dependencies:
2915-
brotli "^1.3.0"
2916-
bson "^1.0.4"
2917-
int53 "^0.2.4"
2918-
object-stream "0.0.1"
2919-
snappyjs "^0.6.0"
2920-
thrift "^0.11.0"
2921-
varint "^5.0.0"
2922-
optionalDependencies:
2923-
lzo "^0.4.0"
2924-
29252862
parse-json@^5.2.0:
29262863
version "5.2.0"
29272864
resolved "https://registry.yarnpkg.com/parse-json/-/parse-json-5.2.0.tgz#c76fc66dee54231c962b22bcc8a72cf2f99753cd"
@@ -3046,11 +2983,6 @@ pure-rand@^6.0.0:
30462983
resolved "https://registry.yarnpkg.com/pure-rand/-/pure-rand-6.0.4.tgz#50b737f6a925468679bff00ad20eade53f37d5c7"
30472984
integrity sha512-LA0Y9kxMYv47GIPJy6MI84fqTd2HmYZI83W/kM/SkKfDlajnZYfmXFTxkbY+xSBPkLJxltMa9hIkmdc29eguMA==
30482985

3049-
q@^1.5.0:
3050-
version "1.5.1"
3051-
resolved "https://registry.yarnpkg.com/q/-/q-1.5.1.tgz#7e32f75b41381291d04611f1bf14109ac00651d7"
3052-
integrity sha512-kV/CThkXo6xyFEZUugw/+pIOywXcDbFYgSct5cT3gqlbkBE1SJdwy6UQoZvodiWF/ckQLZyDE/Bu1M6gVu5lVw==
3053-
30542986
queue-microtask@^1.2.2:
30552987
version "1.2.3"
30562988
resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"
@@ -3204,11 +3136,6 @@ slash@^3.0.0:
32043136
resolved "https://registry.yarnpkg.com/slash/-/slash-3.0.0.tgz#6539be870c165adbd5240220dbe361f1bc4d4634"
32053137
integrity sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==
32063138

3207-
snappyjs@^0.6.0:
3208-
version "0.6.1"
3209-
resolved "https://registry.yarnpkg.com/snappyjs/-/snappyjs-0.6.1.tgz#9bca9ff8c54b133a9cc84a71d22779e97fc51878"
3210-
integrity sha512-YIK6I2lsH072UE0aOFxxY1dPDCS43I5ktqHpeAsuLNYWkE5pGxRGWfDM4/vSUfNzXjC1Ivzt3qx31PCLmc9yqg==
3211-
32123139
source-map-support@0.5.13:
32133140
version "0.5.13"
32143141
resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932"
@@ -3362,15 +3289,6 @@ text-table@^0.2.0:
33623289
resolved "https://registry.yarnpkg.com/text-table/-/text-table-0.2.0.tgz#7f5ee823ae805207c00af2df4a84ec3fcfa570b4"
33633290
integrity sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==
33643291

3365-
thrift@^0.11.0:
3366-
version "0.11.0"
3367-
resolved "https://registry.yarnpkg.com/thrift/-/thrift-0.11.0.tgz#256115e4ff87871e12537f4b510bd2b425e13990"
3368-
integrity sha512-UpsBhOC45a45TpeHOXE4wwYwL8uD2apbHTbtBvkwtUU4dNwCjC7DpQTjw2Q6eIdfNtw+dKthdwq94uLXTJPfFw==
3369-
dependencies:
3370-
node-int64 "^0.4.0"
3371-
q "^1.5.0"
3372-
ws ">= 2.2.3"
3373-
33743292
through2@~2.0.3:
33753293
version "2.0.5"
33763294
resolved "https://registry.yarnpkg.com/through2/-/through2-2.0.5.tgz#01c1e39eb31d07cb7d03a96a70823260b23132cd"
@@ -3550,11 +3468,6 @@ v8-to-istanbul@^9.0.1:
35503468
"@types/istanbul-lib-coverage" "^2.0.1"
35513469
convert-source-map "^2.0.0"
35523470

3553-
varint@^5.0.0:
3554-
version "5.0.2"
3555-
resolved "https://registry.yarnpkg.com/varint/-/varint-5.0.2.tgz#5b47f8a947eb668b848e034dcfa87d0ff8a7f7a4"
3556-
integrity sha512-lKxKYG6H03yCZUpAGOPOsMcGxd1RHCu1iKvEHYDPmTyq2HueGhD73ssNBqqQWfvYs04G9iUFRvmAVLW20Jw6ow==
3557-
35583471
walker@^1.0.8:
35593472
version "1.0.8"
35603473
resolved "https://registry.yarnpkg.com/walker/-/walker-1.0.8.tgz#bd498db477afe573dc04185f011d3ab8a8d7653f"
@@ -3609,11 +3522,6 @@ write-file-atomic@^4.0.2:
36093522
imurmurhash "^0.1.4"
36103523
signal-exit "^3.0.7"
36113524

3612-
"ws@>= 2.2.3":
3613-
version "8.18.0"
3614-
resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.0.tgz#0d7505a6eafe2b0e712d232b42279f53bc289bbc"
3615-
integrity sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==
3616-
36173525
xtend@~4.0.1:
36183526
version "4.0.2"
36193527
resolved "https://registry.yarnpkg.com/xtend/-/xtend-4.0.2.tgz#bb72779f5fa465186b1f438f674fa347fdb5db54"

0 commit comments

Comments
 (0)