-
Notifications
You must be signed in to change notification settings - Fork 123
[PECOBLR-782] added retry handling based on idempotency #689
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,10 +32,20 @@ | |
|
||
|
||
class CommandType(Enum): | ||
EXECUTE_STATEMENT = "ExecuteStatement" | ||
NOT_SET = "NotSet" | ||
OPEN_SESSION = "OpenSession" | ||
CLOSE_SESSION = "CloseSession" | ||
METADATA = "Metadata" | ||
CLOSE_OPERATION = "CloseOperation" | ||
GET_OPERATION_STATUS = "GetOperationStatus" | ||
CANCEL_OPERATION = "CancelOperation" | ||
EXECUTE_STATEMENT = "ExecuteStatement" | ||
FETCH_RESULTS = "FetchResults" | ||
CLOUD_FETCH = "CloudFetch" | ||
AUTH = "Auth" | ||
TELEMETRY_PUSH = "TelemetryPush" | ||
VOLUME_GET = "VolumeGet" | ||
VOLUME_PUT = "VolumePut" | ||
VOLUME_DELETE = "VolumeDelete" | ||
OTHER = "Other" | ||
|
||
@classmethod | ||
|
@@ -45,9 +55,66 @@ def get(cls, value: str): | |
if valid_command: | ||
return getattr(cls, str(valid_command)) | ||
else: | ||
# Map Thrift metadata operations to METADATA type | ||
metadata_operations = { | ||
"GetOperationStatus", "GetResultSetMetadata", "GetTables", | ||
"GetColumns", "GetSchemas", "GetCatalogs", "GetFunctions", | ||
"GetPrimaryKeys", "GetTypeInfo", "GetCrossReference", | ||
"GetImportedKeys", "GetExportedKeys", "GetTableTypes" | ||
} | ||
if value in metadata_operations: | ||
return cls.METADATA | ||
return cls.OTHER | ||
|
||
|
||
class CommandIdempotency(Enum): | ||
IDEMPOTENT = "idempotent" | ||
NON_IDEMPOTENT = "non_idempotent" | ||
|
||
|
||
# Mapping of CommandType to CommandIdempotency | ||
# Based on the official idempotency classification | ||
COMMAND_IDEMPOTENCY_MAP = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. link the retry design doc section here that classifies requests |
||
# NON-IDEMPOTENT operations (safety first - unknown types are not retried) | ||
CommandType.NOT_SET: CommandIdempotency.NON_IDEMPOTENT, | ||
CommandType.EXECUTE_STATEMENT: CommandIdempotency.NON_IDEMPOTENT, | ||
CommandType.FETCH_RESULTS: CommandIdempotency.NON_IDEMPOTENT, | ||
CommandType.VOLUME_PUT: CommandIdempotency.NON_IDEMPOTENT, # PUT can overwrite files | ||
|
||
# IDEMPOTENT operations | ||
CommandType.OPEN_SESSION: CommandIdempotency.IDEMPOTENT, | ||
CommandType.CLOSE_SESSION: CommandIdempotency.IDEMPOTENT, | ||
CommandType.METADATA: CommandIdempotency.IDEMPOTENT, | ||
CommandType.CLOSE_OPERATION: CommandIdempotency.IDEMPOTENT, | ||
CommandType.CANCEL_OPERATION: CommandIdempotency.IDEMPOTENT, | ||
CommandType.CLOUD_FETCH: CommandIdempotency.IDEMPOTENT, | ||
CommandType.AUTH: CommandIdempotency.IDEMPOTENT, | ||
CommandType.TELEMETRY_PUSH: CommandIdempotency.IDEMPOTENT, | ||
CommandType.VOLUME_GET: CommandIdempotency.IDEMPOTENT, | ||
CommandType.VOLUME_DELETE: CommandIdempotency.IDEMPOTENT, | ||
CommandType.OTHER: CommandIdempotency.IDEMPOTENT, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what comes under CommandType.OTHER? Please clarify |
||
} | ||
|
||
# HTTP status codes that should never be retried, even for idempotent requests | ||
# These are client error codes that indicate permanent issues | ||
NON_RETRYABLE_STATUS_CODES = { | ||
400, # Bad Request | ||
401, # Unauthorized | ||
403, # Forbidden | ||
404, # Not Found | ||
405, # Method Not Allowed | ||
409, # Conflict | ||
410, # Gone | ||
411, # Length Required | ||
412, # Precondition Failed | ||
413, # Payload Too Large | ||
414, # URI Too Long | ||
415, # Unsupported Media Type | ||
416, # Range Not Satisfiable | ||
501, # Not Implemented | ||
} | ||
|
||
|
||
class DatabricksRetryPolicy(Retry): | ||
""" | ||
Implements our v3 retry policy by extending urllib3's robust default retry behaviour. | ||
|
@@ -354,38 +421,25 @@ def should_retry(self, method: str, status_code: int) -> Tuple[bool, str]: | |
|
||
logger.info(f"Received status code {status_code} for {method} request") | ||
|
||
# Get command idempotency for use in multiple conditions below | ||
command_idempotency = COMMAND_IDEMPOTENCY_MAP.get( | ||
self.command_type, CommandIdempotency.NON_IDEMPOTENT | ||
) | ||
|
||
# Request succeeded. Don't retry. | ||
if status_code // 100 <= 3: | ||
return False, "2xx/3xx codes are not retried" | ||
|
||
if status_code == 400: | ||
return ( | ||
False, | ||
"Received 400 - BAD_REQUEST. Please check the request parameters.", | ||
) | ||
|
||
if status_code == 401: | ||
return ( | ||
False, | ||
"Received 401 - UNAUTHORIZED. Confirm your authentication credentials.", | ||
) | ||
|
||
if status_code == 403: | ||
return False, "403 codes are not retried" | ||
|
||
# Request failed and server said NotImplemented. This isn't recoverable. Don't retry. | ||
if status_code == 501: | ||
return False, "Received code 501 from server." | ||
|
||
# Request failed and this method is not retryable. We only retry POST requests. | ||
if not self._is_method_retryable(method): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where is this method defined? |
||
return False, "Only POST requests are retried" | ||
|
||
# Request failed with 404 and was a GetOperationStatus. This is not recoverable. Don't retry. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comment mentions Request was a GetOperationStatus but in if condition you're checking for all metadata command types. Is this intended? |
||
if status_code == 404 and self.command_type == CommandType.GET_OPERATION_STATUS: | ||
if status_code == 404 and self.command_type == CommandType.METADATA: | ||
return ( | ||
False, | ||
"GetOperationStatus received 404 code from Databricks. Operation was canceled.", | ||
"Metadata request received 404 code from Databricks. Operation was canceled.", | ||
) | ||
|
||
# Request failed with 404 because CloseSession returns 404 if you repeat the request. | ||
|
@@ -408,23 +462,26 @@ def should_retry(self, method: str, status_code: int) -> Tuple[bool, str]: | |
"CloseOperation received 404 code from Databricks. Cursor is already closed." | ||
) | ||
|
||
if status_code in NON_RETRYABLE_STATUS_CODES: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should return more descriptive error message, based on status_code. |
||
return False, f"Received {status_code} code from Databricks. Operation was canceled." | ||
|
||
# Request failed, was an ExecuteStatement and the command may have reached the server | ||
if ( | ||
self.command_type == CommandType.EXECUTE_STATEMENT | ||
command_idempotency == CommandIdempotency.NON_IDEMPOTENT | ||
and status_code not in self.status_forcelist | ||
and status_code not in self.force_dangerous_codes | ||
): | ||
return ( | ||
False, | ||
"ExecuteStatement command can only be retried for codes 429 and 503", | ||
"Non Idempotent requests can only be retried for codes 429 and 503", | ||
) | ||
|
||
# Request failed with a dangerous code, was an ExecuteStatement, but user forced retries for this | ||
# dangerous code. Note that these lines _are not required_ to make these requests retry. They would | ||
# retry automatically. This code is included only so that we can log the exact reason for the retry. | ||
# This gives users signal that their _retry_dangerous_codes setting actually did something. | ||
if ( | ||
self.command_type == CommandType.EXECUTE_STATEMENT | ||
command_idempotency == CommandIdempotency.NON_IDEMPOTENT | ||
and status_code in self.force_dangerous_codes | ||
): | ||
return ( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -218,11 +218,37 @@ def _prepare_headers( | |
def _prepare_retry_policy(self): | ||
"""Set up the retry policy for the current request.""" | ||
if isinstance(self._retry_policy, DatabricksRetryPolicy): | ||
# Set command type for HTTP requests to OTHER (not database commands) | ||
self._retry_policy.command_type = CommandType.OTHER | ||
# Only set command type to NOT_SET if it hasn't been explicitly set via setRequestType() | ||
if self._retry_policy.command_type is None: | ||
self._retry_policy.command_type = CommandType.NOT_SET | ||
# Start the retry timer for duration-based retry limits | ||
self._retry_policy.start_retry_timer() | ||
|
||
def setRequestType(self, request_type: CommandType): | ||
""" | ||
Set the specific request type for the next HTTP request. | ||
|
||
This allows clients to specify what type of operation they're performing | ||
so the retry policy can make appropriate idempotency decisions. | ||
|
||
Args: | ||
request_type: The CommandType enum value for this operation | ||
|
||
Example: | ||
# For authentication requests (OAuth, etc.) | ||
http_client.setRequestType(CommandType.AUTH) | ||
response = http_client.request(HttpMethod.POST, url, body=data) | ||
|
||
# For cloud fetch operations | ||
http_client.setRequestType(CommandType.CLOUD_FETCH) | ||
response = http_client.request(HttpMethod.GET, cloud_url) | ||
""" | ||
if isinstance(self._retry_policy, DatabricksRetryPolicy): | ||
self._retry_policy.command_type = request_type | ||
logger.debug(f"Set request type to: {request_type.value}") | ||
else: | ||
logger.warning(f"Cannot set request type {request_type.value}: retry policy is not DatabricksRetryPolicy") | ||
|
||
@contextmanager | ||
def request_context( | ||
self, | ||
|
@@ -269,6 +295,11 @@ def request_context( | |
logger.error("HTTP request error: %s", e) | ||
raise RequestError(f"HTTP request error: {e}") | ||
finally: | ||
# Reset command type after request completion to prevent it from affecting subsequent requests | ||
if isinstance(self._retry_policy, DatabricksRetryPolicy): | ||
self._retry_policy.command_type = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. isn't setting command_type to NOT_SET better? it will be type safe |
||
logger.debug("Reset command type after request completion") | ||
|
||
if response: | ||
response.close() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,7 +33,7 @@ def fetch_rows(self, cursor, row_count, fetchmany_size): | |
rows = self.get_some_rows(cursor, fetchmany_size) | ||
if not rows: | ||
# Read all the rows, row_count should match | ||
self.assertEqual(n, row_count) | ||
assert n == row_count | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this change related to this PR |
||
|
||
num_fetches = max(math.ceil(n / 10000), 1) | ||
latency_ms = int((time.time() - start_time) * 1000 / num_fetches), 1 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -278,7 +278,7 @@ def test_retry_max_count_not_exceeded(self, mock_send_telemetry, extra_params): | |
THEN the connector issues six request (original plus five retries) | ||
before raising an exception | ||
""" | ||
with mocked_server_response(status=404) as mock_obj: | ||
with mocked_server_response(status=429) as mock_obj: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice catch! |
||
with pytest.raises(MaxRetryError) as cm: | ||
extra_params = {**extra_params, **self._retry_policy} | ||
with self.connection(extra_params=extra_params) as conn: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This again is imposing thread safety concerns. we need to figure out a way to avoid having a state in
httpclient
.