Commit 856f6fc

Merge pull request #1043 from Yelp/u/kkasp/TRON-2414-set-extra-safe-transaction-limit
Drop under the limit a bit more for maximum safety
2 parents: 211bc9b + 55c4bff

4 files changed: +34 -6 lines
tron/config/config_parse.py

Lines changed: 22 additions & 0 deletions
@@ -858,12 +858,18 @@ class ValidateStatePersistence(Validator):
     config_class = schema.ConfigState
     defaults = {
         "buffer_size": 1,
+        "dynamodb_region": None,
+        "table_name": None,
+        "max_transact_write_items": 8,
     }
 
     validators = {
         "name": valid_string,
         "store_type": config_utils.build_real_enum_validator(schema.StatePersistenceTypes),
         "buffer_size": valid_int,
+        "dynamodb_region": valid_string,
+        "table_name": valid_string,
+        "max_transact_write_items": valid_int,
     }
 
     def post_validation(self, config, config_context):
@@ -873,6 +879,22 @@ def post_validation(self, config, config_context):
             path = config_context.path
             raise ConfigError("%s buffer_size must be >= 1." % path)
 
+        store_type = config.get("store_type")
+
+        if store_type == schema.StatePersistenceTypes.dynamodb.value:
+            if not config.get("table_name"):
+                raise ConfigError(f"{config_context.path} table_name is required when store_type is 'dynamodb'")
+            if not config.get("dynamodb_region"):
+                raise ConfigError(f"{config_context.path} dynamodb_region is required when store_type is 'dynamodb'")
+
+            max_transact = config.get("max_transact_write_items")
+
+            # Upper bound is based on boto3 transact_write_items limit
+            if not 1 <= max_transact <= 100:
+                raise ConfigError(
+                    f"{config_context.path} max_transact_write_items must be between 1 and 100, got {max_transact}"
+                )
+
 
 valid_state_persistence = ValidateStatePersistence()
 
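To see what the new check enforces, here is a standalone sketch of the range validation in isolation. ConfigError is stubbed locally and the helper name is hypothetical, but the 1-100 bounds and the error message mirror the hunk above.

# Standalone sketch; only the bounds and message are taken from the diff.
class ConfigError(Exception):
    pass


def validate_max_transact_write_items(path: str, value: int) -> int:
    # Upper bound is based on the boto3 transact_write_items limit of 100 items
    if not 1 <= value <= 100:
        raise ConfigError(f"{path} max_transact_write_items must be between 1 and 100, got {value}")
    return value


validate_max_transact_write_items("state_persistence", 8)  # passes: the new default
try:
    validate_max_transact_write_items("state_persistence", 101)
except ConfigError as e:
    print(e)  # state_persistence max_transact_write_items must be between 1 and 100, got 101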

tron/config/schema.py

Lines changed: 1 addition & 0 deletions
@@ -106,6 +106,7 @@ def from_dict(cls, data: Dict[str, Any]):
         "buffer_size",
         "dynamodb_region",
         "table_name",
+        "max_transact_write_items",
     ],
 )
 

tron/serialize/runstate/dynamodb_state_store.py

Lines changed: 9 additions & 5 deletions
@@ -42,14 +42,12 @@
 # infinite loops in the case where a key is truly unprocessable. We allow for more retries than it should
 # ever take to avoid failing restores due to transient issues.
 MAX_UNPROCESSED_KEYS_RETRIES = 30
-# While the AWS maximum is 100, we set this to 10 to avoid hitting the 4MB limit for the transaction. See DAR-2637
-MAX_TRANSACT_WRITE_ITEMS = 10
 log = logging.getLogger(__name__)
 T = TypeVar("T")
 
 
 class DynamoDBStateStore:
-    def __init__(self, name, dynamodb_region, stopping=False) -> None:
+    def __init__(self, name, dynamodb_region, stopping=False, max_transact_write_items=8) -> None:
         # Standard mode includes an exponential backoff by a base factor of 2 for a
         # maximum backoff time of 20 seconds (min(b*r^i, MAX_BACKOFF) where b is a
         # random number between 0 and 1 and r is the base factor of 2). This might
@@ -70,6 +68,7 @@ def __init__(self, name, dynamodb_region, stopping=False) -> None:
         self.dynamodb_region = dynamodb_region
         self.table = self.dynamodb.Table(name)
         self.stopping = stopping
+        self.max_transact_write_items = max_transact_write_items
         self.save_queue: OrderedDict = OrderedDict()
         self.save_lock = threading.Lock()
         self.save_errors = 0
@@ -336,7 +335,7 @@ def _save_loop(self):
 
     def __setitem__(self, key: str, value: Tuple[bytes, str]) -> None:
         """
-        Partition the item and write up to MAX_TRANSACT_WRITE_ITEMS
+        Partition the item and write up to self.max_transact_write_items
         partitions atomically using TransactWriteItems.
 
         The function examines the size of pickled_val and json_val,
@@ -392,7 +391,7 @@ def __setitem__(self, key: str, value: Tuple[bytes, str]) -> None:
 
             # We want to write the items when we've either reached the max number of items
             # for a transaction, or when we're done processing all partitions
-            if len(items) == MAX_TRANSACT_WRITE_ITEMS or index == max_partitions - 1:
+            if len(items) == self.max_transact_write_items or index == max_partitions - 1:
                 try:
                     self.client.transact_write_items(TransactItems=items)
                     items = []
@@ -401,6 +400,11 @@ def __setitem__(self, key: str, value: Tuple[bytes, str]) -> None:
                         name="tron.dynamodb.setitem",
                         delta=time.time() - start,
                     )
+                    # TODO: TRON-2419 - We should be smarter here. While each batch is atomic, a sufficiently
+                    # large JobRun could exceed the max size of a single transaction (e.g. a JobRun with 12
+                    # partitions). While one batch might succeed (saving partitions 1-8), the next one (for
+                    # partitions 9-12) might fail. We should handle this case or we will see more hanging
+                    # chads in DynamoDB.
                     log.exception(f"Failed to save partition for key: {key}")
                     raise
         timer(
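Why 8 rather than 10: a single DynamoDB item tops out at 400 KB and a TransactWriteItems call at 4 MB total, so ten maximum-size partitions sit exactly at the transaction ceiling; dropping the default to 8 leaves roughly 800 KB of headroom, which is the "extra safe" limit the commit title refers to. Below is a runnable sketch of the batching loop above with a stub in place of the boto3 client; the stub and the item payloads are illustrative, only the flush condition comes from the diff.

from typing import Any, Dict, List


class StubClient:
    """Stand-in for the boto3 DynamoDB client; records batch sizes only."""

    def transact_write_items(self, TransactItems: List[Dict[str, Any]]) -> None:
        print(f"transact_write_items: {len(TransactItems)} items")


def write_partitions(client: StubClient, partitions: List[Dict[str, Any]], max_transact_write_items: int) -> None:
    # Same flush condition as the diff: the batch is full, or this is the last partition.
    items: List[Dict[str, Any]] = []
    max_partitions = len(partitions)
    for index, partition in enumerate(partitions):
        items.append(partition)
        if len(items) == max_transact_write_items or index == max_partitions - 1:
            client.transact_write_items(TransactItems=items)
            items = []


# A 12-partition JobRun with the new default of 8 yields two transactions (8 items,
# then 4) -- the exact split the TODO above warns about: if the second transaction
# fails after the first succeeds, partitions 1-8 are left behind.
write_partitions(StubClient(), [{"Put": {"partition": i}} for i in range(12)], max_transact_write_items=8)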

tron/serialize/runstate/statemanager.py

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,8 @@ def from_config(cls, persistence_config):
         if store_type == schema.StatePersistenceTypes.dynamodb:
             table_name = persistence_config.table_name
             dynamodb_region = persistence_config.dynamodb_region
-            store = DynamoDBStateStore(table_name, dynamodb_region)
+            max_transact_write_items = persistence_config.max_transact_write_items
+            store = DynamoDBStateStore(table_name, dynamodb_region, max_transact_write_items=max_transact_write_items)
 
         buffer = StateSaveBuffer(buffer_size)
         return PersistentStateManager(store, buffer)
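For reference, the wiring above amounts to a call like the following. The table name and region here are made up, and constructing the store for real requires AWS credentials; only the keyword argument comes from this commit.

from tron.serialize.runstate.dynamodb_state_store import DynamoDBStateStore

store = DynamoDBStateStore(
    "tron-state",                # table_name from the state_persistence config
    "us-west-2",                 # dynamodb_region
    max_transact_write_items=8,  # new knob; defaults to 8 when omitted
)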
