From e87a1f34b2a9140aa9e8aa1f261a195057a7fbd3 Mon Sep 17 00:00:00 2001 From: Xutong Ren Date: Tue, 2 Jun 2026 15:54:53 -0700 Subject: [PATCH] [storage] Add configurable S3 read timeout (OSMO_S3_READ_TIMEOUT) The S3 client (also used for Swift via its S3-compatible API) never set a read_timeout, so botocore's 60s default applied. For large single-PUT objects on backends like Swift, the server replicates and checksums the whole object before responding, which can exceed 60s. The client then raises ReadTimeoutError and retries the entire upload, looping indefinitely even though the bytes transferred fine. Add a configurable read_timeout (env OSMO_S3_READ_TIMEOUT, default 5m) to the botocore Config, mirroring the existing _get_s3_timeout pattern. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/lib/data/storage/backends/s3.py | 19 +++++++++++++++++++ .../storage/backends/tests/test_backends.py | 13 +++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/lib/data/storage/backends/s3.py b/src/lib/data/storage/backends/s3.py index 43c575c01..e2623d3eb 100644 --- a/src/lib/data/storage/backends/s3.py +++ b/src/lib/data/storage/backends/s3.py @@ -52,6 +52,10 @@ # Timeout for S3 API calls. OSMO_S3_TIMEOUT = 'OSMO_S3_TIMEOUT' +# Read timeout for the underlying S3 client (time to wait for a server response +# after a request has been sent). +OSMO_S3_READ_TIMEOUT = 'OSMO_S3_READ_TIMEOUT' + # Max retry count for retryable S3 API calls. OSMO_S3_MAX_RETRY_COUNT = 'OSMO_S3_MAX_RETRY_COUNT' @@ -94,6 +98,20 @@ def _get_s3_timeout() -> datetime.timedelta: return utils_common.to_timedelta(os.getenv(OSMO_S3_TIMEOUT, '24h')) +def _get_s3_read_timeout() -> float: + """ + Returns the read timeout (in seconds) for the underlying S3 client. + + This bounds how long the client waits for a server response after a request + has been fully sent. botocore defaults to 60s, which is too short to + finalize large single-PUT objects on some backends (e.g. Swift), where the + server replicates and checksums the whole object before responding. That + produces spurious read timeouts and full-object retries, so we default to a + more generous 5m. + """ + return utils_common.to_timedelta(os.getenv(OSMO_S3_READ_TIMEOUT, '5m')).total_seconds() + + _VALID_S3_ADDRESSING_STYLES = ('virtual', 'path', 'auto') @@ -184,6 +202,7 @@ def _get_boto_config( 'mode': 'standard', }, 'max_pool_connections': max_pool_connections, + 'read_timeout': _get_s3_read_timeout(), 'request_checksum_calculation': 'when_required', 'response_checksum_validation': 'when_required', } diff --git a/src/lib/data/storage/backends/tests/test_backends.py b/src/lib/data/storage/backends/tests/test_backends.py index 12b6ff14b..db1d1edce 100644 --- a/src/lib/data/storage/backends/tests/test_backends.py +++ b/src/lib/data/storage/backends/tests/test_backends.py @@ -707,6 +707,19 @@ def test_env_override_to_path(self): config = s3._get_boto_config('s3', endpoint_url='https://cwobject.com') self.assertEqual(self._s3_options(config).get('addressing_style'), 'path') + def test_default_read_timeout(self): + """Default read_timeout is 5m (300s), overriding botocore's 60s default.""" + with mock.patch.dict('os.environ', {}, clear=True): + config = s3._get_boto_config('swift', endpoint_url='https://swift.example.com') + # read_timeout is not in botocore's type stubs — read defensively. + self.assertEqual(getattr(config, 'read_timeout'), 300) + + def test_env_override_read_timeout(self): + """OSMO_S3_READ_TIMEOUT overrides the default read_timeout.""" + with mock.patch.dict('os.environ', {s3.OSMO_S3_READ_TIMEOUT: '10m'}): + config = s3._get_boto_config('s3', endpoint_url='https://cwobject.com') + self.assertEqual(getattr(config, 'read_timeout'), 600) + class CredentialAddressingStyleTest(unittest.TestCase): """Tests for carrying S3 addressing style through data credentials."""