-
Notifications
You must be signed in to change notification settings - Fork 10
fix(integrations): skip unconfigured + dedupe health-check alerts #207
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
beb95a0
0204e27
2f7eeec
21bacf1
ec68061
dae5bae
8b0fc58
0654b80
c26a8fd
a8cea54
6643024
2f3df4d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,15 @@ class Alert_Manager { | |
| */ | ||
| const FAILURE_LOG_OPTION = 'newspack_alert_failure_log'; | ||
|
|
||
| /** | ||
| * Window during which a repeating health-check failure for the same | ||
| * integration + error-code signature emits at most one Slack alert. | ||
| * | ||
| * Private because no external caller needs to read it; keeping the | ||
| * surface minimal lets the value evolve without breaking consumers. | ||
| */ | ||
| private const HEALTH_CHECK_DEDUP_INTERVAL = DAY_IN_SECONDS; | ||
|
|
||
| /** | ||
| * Default pattern rules. | ||
| * Each rule defines a grouping dimension, threshold, and time interval. | ||
|
|
@@ -454,14 +463,52 @@ private static function format_interval( $seconds ) { | |
| /** | ||
| * Handle integration health check failure. | ||
| * | ||
| * Deduplicates by integration + error-code + error-message signature | ||
| * for HEALTH_CHECK_DEDUP_INTERVAL so an hourly cron does not repeat | ||
| * the same Slack alert all day. A new error code OR a changed message | ||
| * on the same integration (e.g. "list missing" escalating to "auth | ||
| * fully revoked") falls outside the key and alerts immediately. | ||
| * | ||
| * Known boundaries of the dedup contract: | ||
| * - Message text is part of the key, so locale shifts between cron | ||
| * passes (e.g. `switch_to_locale()` in a multilingual context) can | ||
| * produce a different key for the same underlying error and | ||
| * re-alert. Newspack ESP error messages are static per code today, | ||
| * so this is theoretical; revisit if dynamic content lands in | ||
| * error strings. | ||
| * - The dedup key is stored as a transient, so on hosts backed by a | ||
| * persistent object cache (memcached) the entry can be evicted | ||
| * under LRU pressure before HEALTH_CHECK_DEDUP_INTERVAL elapses. | ||
| * The failure mode is re-alerting on the next hourly cron — the | ||
| * alternative (writing to the options table on every cron tick) | ||
| * has its own cost; transient + accepted re-alert risk is the | ||
| * intentional trade-off here. | ||
| * | ||
| * @param array $payload Health check failure data. | ||
| */ | ||
| public static function handle_health_check_failed( $payload ) { | ||
| $error = $payload['error'] ?? null; | ||
| $error = $payload['error'] ?? null; | ||
| $integration_id = $payload['integration_id'] ?? 'unknown'; | ||
| $error_codes = is_wp_error( $error ) ? $error->get_error_codes() : []; | ||
| if ( empty( $error_codes ) ) { | ||
| $error_codes = [ 'unknown' ]; | ||
| } | ||
| $error_messages = is_wp_error( $error ) ? $error->get_error_messages() : []; | ||
|
|
||
| $dedup_key = self::get_health_check_dedup_key( $integration_id, $error_codes, $error_messages ); | ||
|
Comment on lines
+466
to
+498
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right — PR description updated. The 'Changes proposed' section now states the dedup key is |
||
| if ( get_transient( $dedup_key ) ) { | ||
| return; | ||
| } | ||
|
|
||
| // Set the dedup transient BEFORE dispatch so a `newspack_alert` | ||
| // handler that throws (e.g. transient Slack POST failure) cannot | ||
| // defeat dedup by leaving the key unset for the next hourly cron. | ||
| set_transient( $dedup_key, time(), self::HEALTH_CHECK_DEDUP_INTERVAL ); | ||
|
|
||
| $message = sprintf( | ||
| 'Integration "%s" health check failed: %s', | ||
| $payload['integration_name'] ?? 'unknown', | ||
| is_wp_error( $error ) ? implode( '; ', $error->get_error_messages() ) : 'unknown error' | ||
| is_wp_error( $error ) ? implode( '; ', $error_messages ) : 'unknown error' | ||
| ); | ||
|
|
||
| /** This action is documented in includes/class-alert-manager.php */ | ||
|
|
@@ -476,5 +523,22 @@ public static function handle_health_check_failed( $payload ) { | |
| ] | ||
| ); | ||
| } | ||
|
|
||
| /** | ||
| * Get the deduplication transient key for a health-check failure. | ||
| * | ||
| * @param string $integration_id The integration identifier. | ||
| * @param string[] $error_codes The WP_Error codes from the failure. | ||
| * @param string[] $error_messages The WP_Error messages from the failure. | ||
| * | ||
| * @return string Transient key. | ||
| */ | ||
| private static function get_health_check_dedup_key( $integration_id, $error_codes, $error_messages = [] ) { | ||
| $codes = array_map( 'strval', $error_codes ); | ||
| sort( $codes ); | ||
| $messages = array_map( 'strval', $error_messages ); | ||
| sort( $messages ); | ||
| return 'newspack_alert_hc_' . md5( $integration_id . ':' . implode( ',', $codes ) . ':' . implode( '|', $messages ) ); | ||
| } | ||
| } | ||
| Alert_Manager::init(); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -121,7 +121,7 @@ public static function is_stale( $timestamp ) { | |
| */ | ||
| public static function pull_sync( $integrations = [] ) { | ||
| if ( empty( $integrations ) ) { | ||
| $integrations = Integrations::get_active_integrations(); | ||
| $integrations = Integrations::get_active_configured_integrations(); | ||
| } | ||
|
|
||
| Logger::log( 'Synchronous pull started for user "' . get_current_user_id() . '".', self::LOGGER_HEADER ); | ||
|
|
@@ -158,7 +158,7 @@ public static function pull_sync( $integrations = [] ) { | |
| * @return true|\WP_Error True if all succeeded, WP_Error with combined messages. | ||
| */ | ||
| public static function pull_all( $user_id ) { | ||
| $active_integrations = Integrations::get_active_integrations(); | ||
| $active_integrations = Integrations::get_active_configured_integrations(); | ||
| $errors = []; | ||
|
Comment on lines
160
to
162
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good catch — Fixed in a8cea54. |
||
|
|
||
| foreach ( $active_integrations as $integration ) { | ||
|
|
@@ -225,6 +225,14 @@ public static function handle_ajax_pull() { | |
| wp_send_json_error( 'Integration not found or not enabled.', 404 ); | ||
| } | ||
|
|
||
| // Defense-in-depth: pull_sync already filters to set-up integrations, | ||
| // but a direct AJAX call could still arrive here for an unconfigured | ||
| // integration. Skip silently with success — "not set up" is a no-op, | ||
| // not an error worth surfacing to the loopback caller. | ||
| if ( ! $integration->is_set_up() ) { | ||
| wp_send_json_success(); | ||
| } | ||
|
|
||
| $user_id = get_current_user_id(); | ||
| if ( ! $user_id ) { | ||
| wp_send_json_error( 'No user context.', 403 ); | ||
|
|
@@ -418,6 +426,11 @@ public static function execute_integration_retry( $retry_data ) { | |
| return; | ||
| } | ||
|
|
||
| if ( ! $integration->is_set_up() ) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 Same retry-chain abort on the pull side. Mirrors the sync retry guard: a transient
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same root cause. Fixed in dae5bae — the stored-state |
||
| Logger::log( sprintf( 'Integration "%s" no longer set up on pull retry %d; aborting retry chain.', $integration_id, $retry_count ), self::LOGGER_HEADER ); | ||
| return; | ||
| } | ||
|
|
||
| Logger::log( sprintf( 'Executing pull retry %d/%d for integration "%s" of user %d.', $retry_count, self::MAX_RETRIES, $integration_id, $user_id ), self::LOGGER_HEADER ); | ||
|
|
||
| $result = self::pull_single_integration( $user_id, $integration ); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -162,7 +162,7 @@ private static function push_to_integrations( $contact, $context, $existing_cont | |
| * @param string $context The context of the sync. | ||
| */ | ||
| $contact = \apply_filters( 'newspack_esp_sync_contact', $contact, $context ); | ||
| $integrations = Integrations::get_active_integrations(); | ||
| $integrations = Integrations::get_active_configured_integrations(); | ||
| $errors = []; | ||
|
|
||
| // Resolve user ID for retry scheduling. | ||
|
|
@@ -372,6 +372,11 @@ public static function execute_integration_retry( $retry_data ) { | |
| return; | ||
| } | ||
|
|
||
| if ( ! $integration->is_set_up() ) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 Aborts an in-flight retry chain on a transient probe failure. Same root cause as the push-path comment above. A retry that was queued because of a real prior failure hits this guard at execution time; if the live
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same root cause as the push-path comment. Fixed in dae5bae — |
||
| static::log( sprintf( 'Integration "%s" no longer set up on retry %d; aborting retry chain.', $integration_id, $retry_count ) ); | ||
| return; | ||
| } | ||
|
|
||
| static::log( sprintf( 'Executing retry %d/%d for integration "%s" sync of user %d (%s).', $retry_count, self::MAX_RETRIES, $integration_id, $user_id, $contact['email'] ?? 'unknown' ) ); | ||
|
|
||
| /** This filter is documented in includes/reader-activation/sync/class-contact-sync.php */ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 Dedup keys on error codes only, not messages
An escalating failure that keeps the same code set but carries a worse message (e.g. "list missing" → "auth fully revoked") is suppressed for the full 24h window, so the richer signal never reaches Slack. Worth confirming that's acceptable, or fold a short message hash into the key.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in 0654b80 — the dedup key now folds `WP_Error::get_error_messages()` into the md5 alongside the codes. An escalating same-code failure with a worse message now bypasses the bucket and re-alerts. Added `test_health_check_failed_alerts_on_new_error_messages`. Trade-off noted in the docblock for any future caller passing dynamic content (timestamps/IDs) in the message — for the current Newspack ESP errors, messages are static per code so dedup remains stable.