
Commit 429802e

feat(_put_file): dynamically adjust chunksize based on file size
`_put_file` will automatically increase the `chunksize` when uploading a large file, so the upload stays within S3's 10,000-part multipart limit.
1 parent 2ccadeb commit 429802e
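
A quick sketch of the arithmetic behind this change (illustrative only: `effective_chunksize` and `DEFAULT_CHUNKSIZE` are not part of the commit, they just mirror the new logic in `_put_file`):

import math

MAX_UPLOAD_PARTS = 10_000       # S3 caps a multipart upload at 10,000 parts
DEFAULT_CHUNKSIZE = 50 * 2**20  # 50 MiB, the previously hard-coded default

def effective_chunksize(size: int) -> int:
    # Mirrors the adjustment _put_file now makes when no chunksize is given.
    chunksize = DEFAULT_CHUNKSIZE
    if math.ceil(size / chunksize) > MAX_UPLOAD_PARTS:
        chunksize = math.ceil(size / MAX_UPLOAD_PARTS)
    return chunksize

# A 1 TiB file would need ceil(2**40 / (50 * 2**20)) = 20972 parts at the old
# default, so the chunk size is raised to ceil(2**40 / 10_000) ~= 105 MiB.
print(effective_chunksize(2**40))        # 109951163
# A 100 MiB file is unaffected and keeps the 50 MiB default.
print(effective_chunksize(100 * 2**20))  # 52428800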

1 file changed (+15, -3 lines)

s3fs/core.py

Lines changed: 15 additions & 3 deletions
@@ -3,6 +3,7 @@
 import errno
 import io
 import logging
+import math
 import mimetypes
 import os
 import socket
@@ -69,6 +70,8 @@ def setup_logging(level=None):
     ResponseParserError,
 )
 
+MAX_UPLOAD_PARTS = 10_000  # maximum number of parts for S3 multipart upload
+
 if ClientPayloadError is not None:
     S3_RETRYABLE_ERRORS += (ClientPayloadError,)
 
@@ -1230,7 +1233,7 @@ async def _put_file(
         lpath,
         rpath,
         callback=_DEFAULT_CALLBACK,
-        chunksize=50 * 2**20,
+        chunksize=None,
         max_concurrency=None,
         mode="overwrite",
         **kwargs,
@@ -1258,6 +1261,15 @@ async def _put_file(
         if content_type is not None:
             kwargs["ContentType"] = content_type
 
+        if chunksize is None:
+            chunksize = 50 * 2**20  # default chunksize set to 50 MiB
+            required_chunks = math.ceil(size / chunksize)
+            # increase chunksize to fit within the MAX_UPLOAD_PARTS limit
+            if required_chunks > MAX_UPLOAD_PARTS:
+                # S3 supports uploading objects up to 5 TiB in size,
+                # so each chunk can be up to ~524 MiB.
+                chunksize = math.ceil(size / MAX_UPLOAD_PARTS)
+
         with open(lpath, "rb") as f0:
             if size < min(5 * 2**30, 2 * chunksize):
                 chunk = f0.read()
@@ -1276,8 +1288,8 @@ async def _put_file(
                     key,
                     mpu,
                     f0,
+                    chunksize,
                     callback=callback,
-                    chunksize=chunksize,
                     max_concurrency=max_concurrency,
                 )
                 parts = [
@@ -1305,8 +1317,8 @@ async def _upload_file_part_concurrent(
         key,
         mpu,
         f0,
+        chunksize,
         callback=_DEFAULT_CALLBACK,
-        chunksize=50 * 2**20,
         max_concurrency=None,
     ):
         max_concurrency = max_concurrency or self.max_concurrency
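
With this change callers can simply omit `chunksize` and let s3fs pick one. A usage sketch, assuming fsspec's standard async-to-sync mirroring of `_put_file` as `put_file` and an illustrative bucket name:

import s3fs

fs = s3fs.S3FileSystem()

# chunksize now defaults to None: s3fs starts from 50 MiB and scales the
# chunk size up if the file would otherwise need more than 10,000 parts.
fs.put_file("huge_local_file.bin", "my-bucket/huge_local_file.bin")

# An explicitly passed chunksize is used as-is, just as before.
fs.put_file("big.bin", "my-bucket/big.bin", chunksize=100 * 2**20)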
