
Commit 5a844ac

fix queue-processor rbn draining by default (#478)

* fix queue-processor rbn draining by default
* docs and cli docs
* fix rebalance sqs test to check for eviction
* fix rbn sqs test

1 parent ca912bb commit 5a844ac

5 files changed (+37, -81 lines)

README.md

Lines changed: 4 additions & 2 deletions
```diff
@@ -34,7 +34,7 @@ The aws-node-termination-handler (NTH) can operate in two different modes: Insta
 
 The aws-node-termination-handler **[Instance Metadata Service](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html) Monitor** will run a small pod on each host to perform monitoring of IMDS paths like `/spot` or `/events` and react accordingly to drain and/or cordon the corresponding node.
 
-The aws-node-termination-handler **Queue Processor** will monitor an SQS queue of events from Amazon EventBridge for ASG lifecycle events, EC2 status change events, and Spot Interruption Termination Notice events. When NTH detects an instance is going down, we use the Kubernetes API to cordon the node to ensure no new work is scheduled there, then drain it, removing any existing work. The termination handler **Queue Processor** requires AWS IAM permissions to monitor and manage the SQS queue and to query the EC2 API.
+The aws-node-termination-handler **Queue Processor** will monitor an SQS queue of events from Amazon EventBridge for ASG lifecycle events, EC2 status change events, Spot Interruption Termination Notice events, and Spot Rebalance Recommendation events. When NTH detects an instance is going down, we use the Kubernetes API to cordon the node to ensure no new work is scheduled there, then drain it, removing any existing work. The termination handler **Queue Processor** requires AWS IAM permissions to monitor and manage the SQS queue and to query the EC2 API.
 
 You can run the termination handler on any Kubernetes cluster running on AWS, including self-managed clusters and those created with Amazon [Elastic Kubernetes Service](https://docs.aws.amazon.com/eks/latest/userguide/what-is-eks.html).
 
@@ -80,9 +80,11 @@ IMDS Processor Mode allows for a fine-grained configuration of IMDS paths that a
 - `enableRebalanceMonitoring`
 - `enableScheduledEventDraining`
 
+By default, IMDS mode will only Cordon in response to a Rebalance Recommendation event (all other events are Cordoned and Drained). Cordon is the default for a rebalance event because it's not known if an ASG is being utilized and if that ASG is configured to replace the instance on a rebalance event. If you are using an ASG w/ rebalance recommendations enabled, then you can set the `enableRebalanceDraining` flag to true to perform a Cordon and Drain when a rebalance event is received.
+
 The `enableSqsTerminationDraining` must be set to false for these configuration values to be considered.
 
-The Queue Processor Mode does not allow for fine-grained configuration of which events are handled through helm configuration keys. Instead, you can modify your Amazon EventBridge rules to not send certain types of events to the SQS Queue so that NTH does not process those events.
+The Queue Processor Mode does not allow for fine-grained configuration of which events are handled through helm configuration keys. Instead, you can modify your Amazon EventBridge rules to not send certain types of events to the SQS Queue so that NTH does not process those events. All events when operating in Queue Processor mode are Cordoned and Drained unless the `cordon-only` flag is set to true.
 
 
 The `enableSqsTerminationDraining` flag turns on Queue Processor Mode. When Queue Processor Mode is enabled, IMDS mode cannot be active. NTH cannot respond to queue events AND monitor IMDS paths. Queue Processor Mode still queries for node information on startup, but this information is not required for normal operation, so it is safe to disable IMDS for the NTH pod.
```
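The `enableRebalanceDraining` opt-in described above maps directly to helm values. A minimal sketch, assuming NTH is installed from the project's eks-charts repository (the repo URL and release name are illustrative, not taken from this commit):

```bash
helm repo add eks https://aws.github.io/eks-charts
# enableSqsTerminationDraining must stay false for IMDS-mode flags to apply.
helm upgrade --install aws-node-termination-handler eks/aws-node-termination-handler \
  --namespace kube-system \
  --set enableSqsTerminationDraining=false \
  --set enableRebalanceMonitoring=true \
  --set enableRebalanceDraining=true
```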

cmd/node-termination-handler.go

Lines changed: 21 additions & 21 deletions
```diff
@@ -318,35 +318,35 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
 		runPreDrainTask(node, nodeName, drainEvent, metrics, recorder)
 	}
 
-	podNameList, err := node.FetchPodNameList(nodeName)
-	if err != nil {
-		log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName)
-	}
-	drainEvent.Pods = podNameList
-	err = node.LogPods(podNameList, nodeName)
-	if err != nil {
-		log.Err(err).Msg("There was a problem while trying to log all pod names on the node")
-	}
-
-	if nthConfig.CordonOnly || (drainEvent.IsRebalanceRecommendation() && !nthConfig.EnableRebalanceDraining) {
+	podNameList, err := node.FetchPodNameList(nodeName)
+	if err != nil {
+		log.Err(err).Msgf("Unable to fetch running pods for node '%s' ", nodeName)
+	}
+	drainEvent.Pods = podNameList
+	err = node.LogPods(podNameList, nodeName)
+	if err != nil {
+		log.Err(err).Msg("There was a problem while trying to log all pod names on the node")
+	}
+
+	if nthConfig.CordonOnly || (!nthConfig.EnableSQSTerminationDraining && drainEvent.IsRebalanceRecommendation() && !nthConfig.EnableRebalanceDraining) {
 		err = cordonNode(node, nodeName, drainEvent, metrics, recorder)
 	} else {
 		err = cordonAndDrainNode(node, nodeName, metrics, recorder, nthConfig.EnableSQSTerminationDraining)
 	}
-
+
 	if nthConfig.WebhookURL != "" {
 		webhook.Post(nodeMetadata, drainEvent, nthConfig)
 	}
 
-	if err != nil {
-		<-interruptionEventStore.Workers
-	} else {
-		interruptionEventStore.MarkAllAsProcessed(nodeName)
-		if drainEvent.PostDrainTask != nil {
-			runPostDrainTask(node, nodeName, drainEvent, metrics, recorder)
-		}
-		<-interruptionEventStore.Workers
-	}
+	if err != nil {
+		<-interruptionEventStore.Workers
+	} else {
+		interruptionEventStore.MarkAllAsProcessed(nodeName)
+		if drainEvent.PostDrainTask != nil {
+			runPostDrainTask(node, nodeName, drainEvent, metrics, recorder)
+		}
+		<-interruptionEventStore.Workers
+	}
 
 }
 
```
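The functional change is the added `!nthConfig.EnableSQSTerminationDraining` guard: the cordon-only shortcut for rebalance recommendations now applies only when queue-processor mode is off, so SQS-delivered rebalance events are drained by default. A minimal sketch of the resulting decision table as a shell predicate (a hypothetical helper, not code from the repository; variable names mirror the Go config fields):

```bash
#!/bin/bash
# should_cordon_only mirrors the condition this commit introduces: cordon
# without draining only when cordon-only is requested, or when a rebalance
# recommendation arrives in IMDS (non-SQS) mode with rebalance draining off.
should_cordon_only() {
  local cordon_only=$1 sqs_mode=$2 is_rebalance=$3 rebalance_draining=$4
  [[ $cordon_only == true ]] && return 0
  [[ $sqs_mode != true && $is_rebalance == true && $rebalance_draining != true ]]
}

# Queue-processor mode: a rebalance recommendation is now cordoned AND drained.
should_cordon_only false true true false && echo "cordon only" || echo "cordon and drain"
# IMDS mode keeps the old default for rebalance recommendations: cordon only.
should_cordon_only false false true false && echo "cordon only" || echo "cordon and drain"
```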
config/helm/aws-node-termination-handler/README.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -92,7 +92,7 @@ Parameter | Description | Default
 
 Parameter | Description | Default
 --- | --- | ---
-`enableSqsTerminationDraining` | If true, this turns on queue-processor mode which drains nodes when an SQS termination event is received| `false`
+`enableSqsTerminationDraining` | If true, this turns on queue-processor mode which drains nodes when an SQS termination event is received. | `false`
 `queueURL` | Listens for messages on the specified SQS queue URL | None
 `awsRegion` | If specified, use the AWS region for AWS API calls, else NTH will try to find the region through AWS_REGION env var, IMDS, or the specified queue URL | ``
 `checkASGTagBeforeDraining` | If true, check that the instance is tagged with "aws-node-termination-handler/managed" as the key before draining the node | `true`
@@ -107,8 +107,8 @@ Parameter | Description | Default
 --- | --- | ---
 `enableScheduledEventDraining` | [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event | `false`
 `enableSpotInterruptionDraining` | If true, drain nodes when the spot interruption termination notice is received | `true`
-`enableRebalanceMonitoring` | If true, cordon nodes when the rebalance recommendation notice is received | `false`
 `enableRebalanceDraining` | If true, drain nodes when the rebalance recommendation notice is received | `false`
+`enableRebalanceMonitoring` | If true, cordon nodes when the rebalance recommendation notice is received. If you'd like to drain the node in addition to cordoning, then also set `enableRebalanceDraining`. | `false`
 `useHostNetwork` | If `true`, enables `hostNetwork` for the Linux DaemonSet. NOTE: setting this to `false` may cause issues accessing IMDSv2 if your account is not configured with an IP hop count of 2 | `true`
 
 ### Kubernetes Configuration
```
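For queue-processor mode, the keys in the first table are the relevant ones. A minimal sketch using the same assumed eks-charts install as above (queue URL, region, and account ID are placeholders):

```bash
helm upgrade --install aws-node-termination-handler eks/aws-node-termination-handler \
  --namespace kube-system \
  --set enableSqsTerminationDraining=true \
  --set awsRegion=us-east-1 \
  --set queueURL=https://sqs.us-east-1.amazonaws.com/123456789012/nth-queue
```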

pkg/config/config.go

Lines changed: 1 addition & 1 deletion
```diff
@@ -165,7 +165,7 @@ func ParseCliArgs() (config Config, err error) {
 	flag.BoolVar(&config.EnableScheduledEventDraining, "enable-scheduled-event-draining", getBoolEnv(enableScheduledEventDrainingConfigKey, enableScheduledEventDrainingDefault), "[EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event")
 	flag.BoolVar(&config.EnableSpotInterruptionDraining, "enable-spot-interruption-draining", getBoolEnv(enableSpotInterruptionDrainingConfigKey, enableSpotInterruptionDrainingDefault), "If true, drain nodes when the spot interruption termination notice is received")
 	flag.BoolVar(&config.EnableSQSTerminationDraining, "enable-sqs-termination-draining", getBoolEnv(enableSQSTerminationDrainingConfigKey, enableSQSTerminationDrainingDefault), "If true, drain nodes when an SQS termination event is received")
-	flag.BoolVar(&config.EnableRebalanceMonitoring, "enable-rebalance-monitoring", getBoolEnv(enableRebalanceMonitoringConfigKey, enableRebalanceMonitoringDefault), "If true, cordon nodes when the rebalance recommendation notice is received")
+	flag.BoolVar(&config.EnableRebalanceMonitoring, "enable-rebalance-monitoring", getBoolEnv(enableRebalanceMonitoringConfigKey, enableRebalanceMonitoringDefault), "If true, cordon nodes when the rebalance recommendation notice is received. If you'd like to drain the node in addition to cordoning, then also set \"enableRebalanceDraining\".")
 	flag.BoolVar(&config.EnableRebalanceDraining, "enable-rebalance-draining", getBoolEnv(enableRebalanceDrainingConfigKey, enableRebalanceDrainingDefault), "If true, drain nodes when the rebalance recommendation notice is received")
 	flag.BoolVar(&config.CheckASGTagBeforeDraining, "check-asg-tag-before-draining", getBoolEnv(checkASGTagBeforeDrainingConfigKey, checkASGTagBeforeDrainingDefault), "If true, check that the instance is tagged with \"aws-node-termination-handler/managed\" as the key before draining the node")
 	flag.StringVar(&config.ManagedAsgTag, "managed-asg-tag", getEnv(managedAsgTagConfigKey, managedAsgTagDefault), "Sets the tag to check for on instances that is propogated from the ASG before taking action, default to aws-node-termination-handler/managed")
```

test/e2e/rebalance-recommendation-sqs-test

Lines changed: 9 additions & 55 deletions
```diff
@@ -153,7 +153,7 @@ GET_ATTRS_SQS_CMD="awslocal sqs get-queue-attributes --queue-url ${queue_url} --
 
 cordoned=0
 tainted=0
-not_evicted=0
+evicted=0
 message_deleted=0
 test_node="${TEST_NODE:-$CLUSTER_NAME-worker}"
 for i in $(seq 1 $TAINT_CHECK_CYCLES); do
@@ -167,16 +167,17 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
         tainted=1
     fi
 
-    if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
-        echo "✅ Verified the regular-pod-test pod was NOT evicted!"
-        not_evicted=1
+    if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
+        echo "✅ Verified the regular-pod-test pod was evicted!"
+        evicted=1
     fi
 
-    if [[ ${tainted} -eq 1 && $(kubectl exec -i "${localstack_pod}" -- bash -c "${GET_ATTRS_SQS_CMD}" | jq '(.Attributes.ApproximateNumberOfMessagesNotVisible|tonumber) + (.Attributes.ApproximateNumberOfMessages|tonumber)' ) -eq 0 ]]; then
+    if [[ ${evicted} -eq 1 && $(kubectl exec -i "${localstack_pod}" -- bash -c "${GET_ATTRS_SQS_CMD}" | jq '(.Attributes.ApproximateNumberOfMessagesNotVisible|tonumber) + (.Attributes.ApproximateNumberOfMessages|tonumber)' ) -eq 0 ]]; then
         kubectl exec -i "${localstack_pod}" -- bash -c "${GET_ATTRS_SQS_CMD}"
         echo "✅ Verified the message was deleted from the queue after processing!"
         message_deleted=1
-        break
+        echo "✅ Rebalance Recommendation SQS Test Passed $CLUSTER_NAME! ✅"
+        exit 0
     fi
 
     echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
@@ -189,59 +190,12 @@ if [[ $cordoned -eq 0 ]]; then
 elif [[ $tainted -eq 0 ]]; then
     echo "❌ Worker node was not tainted"
     fail_and_exit 3
-elif [[ $not_evicted -eq 0 ]]; then
-    echo "❌ regular-pod-test was evicted"
+elif [[ $evicted -eq 0 ]]; then
+    echo "❌ regular-pod-test was NOT evicted"
     fail_and_exit 3
 elif [[ $message_deleted -eq 0 ]]; then
     echo "❌ message was not removed from the queue after processing"
     fail_and_exit 3
 fi
 
-# Ensure pod is evicted following a spot itn
-SPOT_EVENT=$(cat <<EOF
-{
-  "version": "0",
-  "id": "1e5527d7-bb36-4607-3370-4164db56a40e",
-  "detail-type": "EC2 Spot Instance Interruption Warning",
-  "source": "aws.ec2",
-  "account": "123456789012",
-  "time": "$(date -u +"%Y-%m-%dT%TZ")",
-  "region": "us-east-1",
-  "resources": [
-    "arn:aws:ec2:us-east-1b:instance/${instance_id}"
-  ],
-  "detail": {
-    "instance-id": "${instance_id}",
-    "instance-action": "terminate"
-  }
-}
-EOF
-)
-
-SPOT_EVENT_ONE_LINE=$(echo "${SPOT_EVENT}" | tr -d '\n' |sed 's/\"/\\"/g')
-SEND_SQS_CMD="awslocal sqs send-message --queue-url ${queue_url} --message-body \"${SPOT_EVENT_ONE_LINE}\" --region ${AWS_REGION}"
-kubectl exec -i "${localstack_pod}" -- bash -c "${SEND_SQS_CMD}"
-echo "✅ Sent Spot Interruption Event to SQS queue: ${queue_url}"
-GET_ATTRS_SQS_CMD="awslocal sqs get-queue-attributes --queue-url ${queue_url} --attribute-names All --region ${AWS_REGION}"
-
-echo "🥑 Waiting for Spot ITN..."
-evicted=0
-for i in $(seq 1 $TAINT_CHECK_CYCLES); do
-    if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
-        echo "✅ Verified the regular-pod-test pod was evicted!"
-        evicted=1
-    fi
-
-    if [[ ${evicted} -eq 1 && $(kubectl exec -i "${localstack_pod}" -- bash -c "${GET_ATTRS_SQS_CMD}" | jq '(.Attributes.ApproximateNumberOfMessagesNotVisible|tonumber) + (.Attributes.ApproximateNumberOfMessages|tonumber)' ) -eq 0 ]]; then
-        kubectl exec -i "${localstack_pod}" -- bash -c "${GET_ATTRS_SQS_CMD}"
-        echo "✅ Verified the message was deleted from the queue after processing!"
-        echo "✅ Rebalance Recommendation SQS Test Passed $CLUSTER_NAME! ✅"
-        exit 0
-    fi
-
-    echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
-    sleep $TAINT_CHECK_SLEEP
-done
-
-echo "❌ Rebalance Recommendation SQS Test Failed $CLUSTER_NAME"
 fail_and_exit 1
```
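For context, the rebalance-recommendation message this test sends earlier in the script (outside these hunks) has the same shape as the deleted spot-ITN heredoc. A hedged sketch of such a payload, with the detail-type taken from EventBridge's EC2 event types; the script's actual message is not visible in this diff:

```bash
# Illustrative only; mirrors the structure of the deleted SPOT_EVENT block.
REBALANCE_EVENT=$(cat <<EOF
{
  "version": "0",
  "detail-type": "EC2 Instance Rebalance Recommendation",
  "source": "aws.ec2",
  "account": "123456789012",
  "time": "$(date -u +"%Y-%m-%dT%TZ")",
  "region": "us-east-1",
  "resources": [
    "arn:aws:ec2:us-east-1b:instance/${instance_id}"
  ],
  "detail": {
    "instance-id": "${instance_id}"
  }
}
EOF
)
```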
