# infinite loops in the case where a key is truly unprocessable. We allow for more retries than it should
# ever take to avoid failing restores due to transient issues.
MAX_UNPROCESSED_KEYS_RETRIES = 30
-# While the AWS maximum is 100, we set this to 10 to avoid hitting the 4MB limit for the transaction. See DAR-2637
-MAX_TRANSACT_WRITE_ITEMS = 10
log = logging.getLogger(__name__)
T = TypeVar("T")


class DynamoDBStateStore:
-    def __init__(self, name, dynamodb_region, stopping=False) -> None:
+    def __init__(self, name, dynamodb_region, stopping=False, max_transact_write_items=8) -> None:
        # Standard mode includes an exponential backoff by a base factor of 2 for a
        # maximum backoff time of 20 seconds (min(b*r^i, MAX_BACKOFF) where b is a
        # random number between 0 and 1 and r is the base factor of 2). This might
@@ -70,6 +68,7 @@ def __init__(self, name, dynamodb_region, stopping=False) -> None:
        self.dynamodb_region = dynamodb_region
        self.table = self.dynamodb.Table(name)
        self.stopping = stopping
+        self.max_transact_write_items = max_transact_write_items
        self.save_queue: OrderedDict = OrderedDict()
        self.save_lock = threading.Lock()
        self.save_errors = 0
@@ -336,7 +335,7 @@ def _save_loop(self):

    def __setitem__(self, key: str, value: Tuple[bytes, str]) -> None:
        """
-        Partition the item and write up to MAX_TRANSACT_WRITE_ITEMS
+        Partition the item and write up to self.max_transact_write_items
        partitions atomically using TransactWriteItems.

        The function examines the size of pickled_val and json_val,
@@ -392,7 +391,7 @@ def __setitem__(self, key: str, value: Tuple[bytes, str]) -> None:

            # We want to write the items when we've either reached the max number of items
            # for a transaction, or when we're done processing all partitions
-            if len(items) == MAX_TRANSACT_WRITE_ITEMS or index == max_partitions - 1:
+            if len(items) == self.max_transact_write_items or index == max_partitions - 1:
                try:
                    self.client.transact_write_items(TransactItems=items)
                    items = []
@@ -401,6 +400,11 @@ def __setitem__(self, key: str, value: Tuple[bytes, str]) -> None:
                        name="tron.dynamodb.setitem",
                        delta=time.time() - start,
                    )
+                    # TODO: TRON-2419 - We should be smarter here. While each batch is atomic, a sufficiently
+                    # large JobRun could exceed the max size of a single transaction (e.g. a JobRun with 12
+                    # partitions). While one batch might succeed (saving partitions 1-8), the next one (for
+                    # partitions 9-12) might fail. We need to handle this case or we will see more hanging
+                    # chads in DynamoDB.
                    log.exception(f"Failed to save partition for key: {key}")
                    raise
        timer(
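
For context on the write path this PR touches: `__setitem__` batches partition writes so that each `transact_write_items` call stays under the configured cap. A minimal sketch of that chunking pattern, assuming pre-built `TransactWriteItem` dicts and a boto3 DynamoDB client (the `save_partitions` helper is hypothetical, not part of this PR):

```python
import boto3

def save_partitions(client, items, max_transact_write_items=8):
    """Write pre-built TransactWriteItem dicts in batches of at most
    max_transact_write_items, mirroring the loop in __setitem__.
    """
    for start in range(0, len(items), max_transact_write_items):
        batch = items[start : start + max_transact_write_items]
        # Atomic within this batch only; a later batch can still fail,
        # leaving earlier batches committed (the TRON-2419 window).
        client.transact_write_items(TransactItems=batch)

# Hypothetical usage:
# client = boto3.client("dynamodb", region_name="us-west-2")
# save_partitions(client, items)  # 12 items, cap 8 -> 2 transactions
```

With the default cap of 8, a 12-partition JobRun yields two transactions (partitions 1-8, then 9-12); if the second fails after the first commits, partitions 1-8 are left behind, which is exactly the "hanging chads" case the TODO warns about.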
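The retry comment in `__init__` quotes the standard-mode backoff formula, `min(b * r**i, MAX_BACKOFF)`. A quick worked version with the constants taken from that comment (r = 2, 20-second cap), just to make the numbers concrete:

```python
import random

MAX_BACKOFF = 20  # seconds, per the comment in __init__

def backoff_delay(attempt: int) -> float:
    b = random.random()  # random number between 0 and 1
    return min(b * 2**attempt, MAX_BACKOFF)

# By attempt 5 the uncapped term is b * 32, so any b > 0.625 already
# hits the 20-second ceiling.
```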