From efc64f0e2bfb3b536caf6da68f3ac208cf02a2d8 Mon Sep 17 00:00:00 2001 From: Theodore Kolkman Date: Thu, 24 Nov 2022 16:24:05 -0600 Subject: [PATCH 1/8] Added script to check if Borealis is down --- telemetry/alerts/borealis_down | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100755 telemetry/alerts/borealis_down diff --git a/telemetry/alerts/borealis_down b/telemetry/alerts/borealis_down new file mode 100755 index 0000000..aee72bb --- /dev/null +++ b/telemetry/alerts/borealis_down @@ -0,0 +1,52 @@ +#!/bin/bash +# Copyright 2022 SuperDARN Canada, University of Saskatchewan +# Author: Theodore Kolkman +# +# Script that pings the Borealis computer, and sends a text alert if the computer is unreachable. +# This script should be run via site CD or DDS computer, so that text alerts are only sent if only +# Borealis is unreachable, but CD or DDS are still online. If the whole site is down, there is +# little we can do, and we don't want to be notified. +# +# Dependencies: +# - mutt (Installed with zypper) +# - postfix enabled and started +# - BOREALIS_IP defined in ~/.bashrc +# - Email-to-text addresses defined correctly +# - Line 'myorigin = smtp.servername.suffix' must be added to /etc/postfix/main.cf to stop cell +# providers from blocking the computer sending via mutt + +################################################################################################### + +source "${HOME}/.bashrc" + +# Define text numbers to alert here +# Correct cell provider email-to-sms addresses must be used too +ADDRESSES=(3062313083@sms.sasktel.com) + +SERVER_DOWN="false" + +MESSAGE="SuperDARN Borealis Alert:\n\n${RADARID}borealis is down" +echo -e $MESSAGE + +################################################################################################### + +printf "Executing $0 on $(hostname) for ${RADARID}\n" +date --utc "+%Y%m%d %H:%M:%S UTC" + +# If ping fails, Borealis is unreachable and return value will be 1 +ping -c1 -W1 $BOREALIS_IP +if [[ $? -ne 0 ]]; then + echo 'Borealis is down' + SERVER_DOWN="true" +fi + +# Send text alert if Borealis unreachable. +if [[ $SERVER_DOWN == "true" ]]; then + echo "Sending text alerts to:" + printf '%s\n' "${ADDRESSES[@]}" + for address in $ADDRESSES; do + echo -e $MESSAGE | mutt -- $address + done +fi +echo "" + From 19bf0f73f62591607b920b6c137047db6c0a4d6c Mon Sep 17 00:00:00 2001 From: Theodore Kolkman Date: Thu, 24 Nov 2022 16:42:01 -0600 Subject: [PATCH 2/8] Added flag to ensure only a single message is sent --- telemetry/alerts/borealis_down | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/telemetry/alerts/borealis_down b/telemetry/alerts/borealis_down index aee72bb..77ed068 100755 --- a/telemetry/alerts/borealis_down +++ b/telemetry/alerts/borealis_down @@ -24,6 +24,7 @@ source "${HOME}/.bashrc" ADDRESSES=(3062313083@sms.sasktel.com) SERVER_DOWN="false" +MESSAGE_SENT="/tmp/.message_sent_flag" MESSAGE="SuperDARN Borealis Alert:\n\n${RADARID}borealis is down" echo -e $MESSAGE @@ -36,17 +37,20 @@ date --utc "+%Y%m%d %H:%M:%S UTC" # If ping fails, Borealis is unreachable and return value will be 1 ping -c1 -W1 $BOREALIS_IP if [[ $? -ne 0 ]]; then + rm -f $MESSAGE_SENT +else echo 'Borealis is down' SERVER_DOWN="true" fi -# Send text alert if Borealis unreachable. -if [[ $SERVER_DOWN == "true" ]]; then +# Send text alert if Borealis unreachable, and an alert hasn't already been sent +if [[ $SERVER_DOWN == "true" ]] && [[ ! -e $MESSAGE_SENT ]]; then echo "Sending text alerts to:" printf '%s\n' "${ADDRESSES[@]}" for address in $ADDRESSES; do echo -e $MESSAGE | mutt -- $address done + touch $MESSAGE_SENT fi echo "" From f02fd0bf032c598ae2015dd82e69c683d288e286 Mon Sep 17 00:00:00 2001 From: Theodore Kolkman Date: Mon, 5 Dec 2022 15:35:57 -0600 Subject: [PATCH 3/8] Removed testing configuration, added to text alert message --- telemetry/alerts/borealis_down | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/telemetry/alerts/borealis_down b/telemetry/alerts/borealis_down index 77ed068..bb0a16f 100755 --- a/telemetry/alerts/borealis_down +++ b/telemetry/alerts/borealis_down @@ -26,8 +26,7 @@ ADDRESSES=(3062313083@sms.sasktel.com) SERVER_DOWN="false" MESSAGE_SENT="/tmp/.message_sent_flag" -MESSAGE="SuperDARN Borealis Alert:\n\n${RADARID}borealis is down" -echo -e $MESSAGE +MESSAGE="SuperDARN Borealis Alert:\n\n${RADARID}borealis is down\n\nSent from $(hostname)" ################################################################################################### @@ -36,7 +35,7 @@ date --utc "+%Y%m%d %H:%M:%S UTC" # If ping fails, Borealis is unreachable and return value will be 1 ping -c1 -W1 $BOREALIS_IP -if [[ $? -ne 0 ]]; then +if [[ $? -eq 0 ]]; then rm -f $MESSAGE_SENT else echo 'Borealis is down' From 8515433d4f65470581b4838f472ddd64c043d578 Mon Sep 17 00:00:00 2001 From: Theodore Kolkman Date: Mon, 12 Dec 2022 13:26:43 -0600 Subject: [PATCH 4/8] Modified borealis_down to operate as daemon Added checks to only send texts after ping fails 10 times --- telemetry/alerts/borealis_down | 65 ++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/telemetry/alerts/borealis_down b/telemetry/alerts/borealis_down index bb0a16f..5fcd5ed 100755 --- a/telemetry/alerts/borealis_down +++ b/telemetry/alerts/borealis_down @@ -14,6 +14,19 @@ # - Email-to-text addresses defined correctly # - Line 'myorigin = smtp.servername.suffix' must be added to /etc/postfix/main.cf to stop cell # providers from blocking the computer sending via mutt +# +# This script should be ran as a systemd daemon. An example .service file is shown below: +# [Unit] +# Description=Borealis down alert daemon +# +# [Service] +# User=transfer +# ExecStart=/home/transfer/edtk/telemetry/alerts/borealis_down +# Restart=always +# +# [Install] +# WantedBy=multi-user.target + ################################################################################################### @@ -23,33 +36,41 @@ source "${HOME}/.bashrc" # Correct cell provider email-to-sms addresses must be used too ADDRESSES=(3062313083@sms.sasktel.com) -SERVER_DOWN="false" -MESSAGE_SENT="/tmp/.message_sent_flag" +MESSAGE_SENT="false" # Flag to signal a text message has been sent +PING_FAIL_COUNT=0 # Number of times ping has failed to reach borealis computer +THRESHOLD=10 # Number of times ping must fail before message is sent +SLEEP_TIME=5 # How long program waits between pings MESSAGE="SuperDARN Borealis Alert:\n\n${RADARID}borealis is down\n\nSent from $(hostname)" ################################################################################################### -printf "Executing $0 on $(hostname) for ${RADARID}\n" +printf "\nDaemon $0 starting on $(hostname) for ${RADARID}\n" date --utc "+%Y%m%d %H:%M:%S UTC" +printf "\n\n" -# If ping fails, Borealis is unreachable and return value will be 1 -ping -c1 -W1 $BOREALIS_IP -if [[ $? -eq 0 ]]; then - rm -f $MESSAGE_SENT -else - echo 'Borealis is down' - SERVER_DOWN="true" -fi - -# Send text alert if Borealis unreachable, and an alert hasn't already been sent -if [[ $SERVER_DOWN == "true" ]] && [[ ! -e $MESSAGE_SENT ]]; then - echo "Sending text alerts to:" - printf '%s\n' "${ADDRESSES[@]}" - for address in $ADDRESSES; do - echo -e $MESSAGE | mutt -- $address - done - touch $MESSAGE_SENT -fi -echo "" +while true; do + # If ping fails, Borealis is unreachable and return value will be 1 + ping -c1 -W1 192.155.242.156 # $BOREALIS_IP + if [[ $? -eq 0 ]]; then + MESSAGE_SENT="false" + PING_FAIL_COUNT=0 + else + PING_FAIL_COUNT=$((PING_FAIL_COUNT + 1)) + echo "$(date --utc "+%Y%m%d %H:%M:%S") Borealis is down, ping number $PING_FAIL_COUNT" + fi + # Send text alert if Borealis unreachable, and an alert hasn't already been sent + if [[ $PING_FAIL_COUNT -ge $THRESHOLD ]] && [[ $MESSAGE_SENT == "false" ]]; then + date --utc "+%Y%m%d %H:%M:%S UTC" + echo "Sending text alerts to:" + printf '%s\n' "${ADDRESSES[@]}" + for address in $ADDRESSES; do + echo -e $MESSAGE | mutt -- $address + done + MESSAGE_SENT="true" + PING_FAIL_COUNT=0 + fi + sleep $SLEEP_TIME + printf "\n" +done \ No newline at end of file From 2f25bbbc3b426903d517a991c2dab9fbeb90ead8 Mon Sep 17 00:00:00 2001 From: Theodore Kolkman Date: Mon, 12 Dec 2022 13:34:04 -0600 Subject: [PATCH 5/8] Fixed ping'd IP, altered frequency of pings --- telemetry/alerts/borealis_down | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/telemetry/alerts/borealis_down b/telemetry/alerts/borealis_down index 5fcd5ed..8cb1d17 100755 --- a/telemetry/alerts/borealis_down +++ b/telemetry/alerts/borealis_down @@ -39,7 +39,7 @@ ADDRESSES=(3062313083@sms.sasktel.com) MESSAGE_SENT="false" # Flag to signal a text message has been sent PING_FAIL_COUNT=0 # Number of times ping has failed to reach borealis computer THRESHOLD=10 # Number of times ping must fail before message is sent -SLEEP_TIME=5 # How long program waits between pings +SLEEP_TIME=9 # How long program waits between pings MESSAGE="SuperDARN Borealis Alert:\n\n${RADARID}borealis is down\n\nSent from $(hostname)" @@ -51,7 +51,7 @@ printf "\n\n" while true; do # If ping fails, Borealis is unreachable and return value will be 1 - ping -c1 -W1 192.155.242.156 # $BOREALIS_IP + ping -c1 -W1 $BOREALIS_IP if [[ $? -eq 0 ]]; then MESSAGE_SENT="false" PING_FAIL_COUNT=0 From e57735ed19d00d481e4bea8ce70b26eda2d6f939 Mon Sep 17 00:00:00 2001 From: Theodore Kolkman Date: Mon, 12 Dec 2022 13:45:12 -0600 Subject: [PATCH 6/8] Added logging --- telemetry/alerts/borealis_down | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/telemetry/alerts/borealis_down b/telemetry/alerts/borealis_down index 8cb1d17..dfcaeb2 100755 --- a/telemetry/alerts/borealis_down +++ b/telemetry/alerts/borealis_down @@ -43,8 +43,12 @@ SLEEP_TIME=9 # How long program waits between pings MESSAGE="SuperDARN Borealis Alert:\n\n${RADARID}borealis is down\n\nSent from $(hostname)" +LOGFILE="${HOME}/logs/borealis_down.log" + ################################################################################################### +exec &>> $LOGFILE + printf "\nDaemon $0 starting on $(hostname) for ${RADARID}\n" date --utc "+%Y%m%d %H:%M:%S UTC" printf "\n\n" From ee41c9ff14083c0dfdc4ba8a1e5eedfc8c358283 Mon Sep 17 00:00:00 2001 From: Theodore Kolkman Date: Wed, 14 Dec 2022 16:31:16 -0600 Subject: [PATCH 7/8] Changed alerts to be sent via Slack instead of Text - Created config.sh file containing webhook urls for each site - Changed sending of text messages to instead send the message to the webhook. The messages are displayed on specific channels on Slack instead - Changed the threshold to only send messages after Borealis is unreachable for 10 minutes --- telemetry/alerts/borealis_down | 31 ++++++++++++++++--------------- telemetry/alerts/config.sh | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 15 deletions(-) create mode 100644 telemetry/alerts/config.sh diff --git a/telemetry/alerts/borealis_down b/telemetry/alerts/borealis_down index dfcaeb2..d43023d 100755 --- a/telemetry/alerts/borealis_down +++ b/telemetry/alerts/borealis_down @@ -2,10 +2,10 @@ # Copyright 2022 SuperDARN Canada, University of Saskatchewan # Author: Theodore Kolkman # -# Script that pings the Borealis computer, and sends a text alert if the computer is unreachable. -# This script should be run via site CD or DDS computer, so that text alerts are only sent if only -# Borealis is unreachable, but CD or DDS are still online. If the whole site is down, there is -# little we can do, and we don't want to be notified. +# Script that pings the Borealis computer, and sends a Slack message alert if the computer is +# unreachable. This script should be run via site CD or DDS computer, so that Slack alerts are only +# sent if only Borealis is unreachable, but CD or DDS are still online. If the whole site is down, +# there is little we can do, and we don't want to be notified. # # Dependencies: # - mutt (Installed with zypper) @@ -14,6 +14,7 @@ # - Email-to-text addresses defined correctly # - Line 'myorigin = smtp.servername.suffix' must be added to /etc/postfix/main.cf to stop cell # providers from blocking the computer sending via mutt +# - A valid Slack webhook must be defined in config.sh # # This script should be ran as a systemd daemon. An example .service file is shown below: # [Unit] @@ -31,17 +32,19 @@ ################################################################################################### source "${HOME}/.bashrc" +source "${HOME}/edtk/telemetry/alerts/config.sh" # Load in alert config file -# Define text numbers to alert here -# Correct cell provider email-to-sms addresses must be used too -ADDRESSES=(3062313083@sms.sasktel.com) +# Get the Slack webhook address to send alerts to +# This address specified what channel on Slack the alert will be sent to +HOOK_VARIABLE="${RADARID}_hook" # Ex) sas_hook +WEBHOOK="${!HOOK_VARIABLE}" # Ex) https://hooks... MESSAGE_SENT="false" # Flag to signal a text message has been sent PING_FAIL_COUNT=0 # Number of times ping has failed to reach borealis computer THRESHOLD=10 # Number of times ping must fail before message is sent -SLEEP_TIME=9 # How long program waits between pings +SLEEP_TIME=59 # How long program waits between pings in seconds -MESSAGE="SuperDARN Borealis Alert:\n\n${RADARID}borealis is down\n\nSent from $(hostname)" +MESSAGE=":rotating_light: Alert :rotating_light: ${RADARID}borealis is down" LOGFILE="${HOME}/logs/borealis_down.log" @@ -67,13 +70,11 @@ while true; do # Send text alert if Borealis unreachable, and an alert hasn't already been sent if [[ $PING_FAIL_COUNT -ge $THRESHOLD ]] && [[ $MESSAGE_SENT == "false" ]]; then date --utc "+%Y%m%d %H:%M:%S UTC" - echo "Sending text alerts to:" - printf '%s\n' "${ADDRESSES[@]}" - for address in $ADDRESSES; do - echo -e $MESSAGE | mutt -- $address - done + echo "Sending text alerts to: ${WEBHOOK}" + + curl --header "Content-type: application/json" --data "{'text':'${MESSAGE}'}" $WEBHOOK + MESSAGE_SENT="true" - PING_FAIL_COUNT=0 fi sleep $SLEEP_TIME printf "\n" diff --git a/telemetry/alerts/config.sh b/telemetry/alerts/config.sh new file mode 100644 index 0000000..424de36 --- /dev/null +++ b/telemetry/alerts/config.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright 2022 SuperDARN Canada, University of Saskatchewan +# Author: Theodore Kolkman +# +# Configuration variables used by alert scripts + +################################################################################################### + +# Slack webhooks +# Used to send alerts to specific Slack channels +readonly sas_hook="https://hooks.slack.com/services/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" +readonly pgr_hook="https://hooks.slack.com/services/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" +readonly inv_hook="https://hooks.slack.com/services/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" +readonly cly_hook="https://hooks.slack.com/services/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" +readonly rkn_hook="https://hooks.slack.com/services/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" From 3ae97c4fedd06df0203ec21513bf8e50f2523e49 Mon Sep 17 00:00:00 2001 From: Theodore Kolkman Date: Wed, 14 Dec 2022 16:42:46 -0600 Subject: [PATCH 8/8] Removing Slack webhooks from GitHub --- telemetry/alerts/borealis_down | 11 ++++------- telemetry/alerts/config.sh | 15 --------------- 2 files changed, 4 insertions(+), 22 deletions(-) delete mode 100644 telemetry/alerts/config.sh diff --git a/telemetry/alerts/borealis_down b/telemetry/alerts/borealis_down index d43023d..29068e6 100755 --- a/telemetry/alerts/borealis_down +++ b/telemetry/alerts/borealis_down @@ -14,7 +14,7 @@ # - Email-to-text addresses defined correctly # - Line 'myorigin = smtp.servername.suffix' must be added to /etc/postfix/main.cf to stop cell # providers from blocking the computer sending via mutt -# - A valid Slack webhook must be defined in config.sh +# - SLACK_WEBHOOK defined in ~/.bashrc to a valid Slack webhook URL # # This script should be ran as a systemd daemon. An example .service file is shown below: # [Unit] @@ -34,10 +34,7 @@ source "${HOME}/.bashrc" source "${HOME}/edtk/telemetry/alerts/config.sh" # Load in alert config file -# Get the Slack webhook address to send alerts to -# This address specified what channel on Slack the alert will be sent to -HOOK_VARIABLE="${RADARID}_hook" # Ex) sas_hook -WEBHOOK="${!HOOK_VARIABLE}" # Ex) https://hooks... +# Get SLACK_WEBHOOK from .bashrc MESSAGE_SENT="false" # Flag to signal a text message has been sent PING_FAIL_COUNT=0 # Number of times ping has failed to reach borealis computer @@ -70,9 +67,9 @@ while true; do # Send text alert if Borealis unreachable, and an alert hasn't already been sent if [[ $PING_FAIL_COUNT -ge $THRESHOLD ]] && [[ $MESSAGE_SENT == "false" ]]; then date --utc "+%Y%m%d %H:%M:%S UTC" - echo "Sending text alerts to: ${WEBHOOK}" + echo "Sending text alerts to: ${SLACK_WEBHOOK}" - curl --header "Content-type: application/json" --data "{'text':'${MESSAGE}'}" $WEBHOOK + curl --header "Content-type: application/json" --data "{'text':'${MESSAGE}'}" $SLACK_WEBHOOK MESSAGE_SENT="true" fi diff --git a/telemetry/alerts/config.sh b/telemetry/alerts/config.sh deleted file mode 100644 index 424de36..0000000 --- a/telemetry/alerts/config.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -# Copyright 2022 SuperDARN Canada, University of Saskatchewan -# Author: Theodore Kolkman -# -# Configuration variables used by alert scripts - -################################################################################################### - -# Slack webhooks -# Used to send alerts to specific Slack channels -readonly sas_hook="https://hooks.slack.com/services/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" -readonly pgr_hook="https://hooks.slack.com/services/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" -readonly inv_hook="https://hooks.slack.com/services/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" -readonly cly_hook="https://hooks.slack.com/services/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" -readonly rkn_hook="https://hooks.slack.com/services/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"