diff --git a/NEWS.adoc b/NEWS.adoc index d62589886f..85b026a970 100644 --- a/NEWS.adoc +++ b/NEWS.adoc @@ -100,6 +100,17 @@ https://github.com/networkupstools/nut/milestone/12 * Introduced a `SHUTDOWN_HOSTSYNC` notification message, to report that the primary `upsmon` initiated the shutdown and has some secondaries to wait for first. [#3084] + * Make sure an `FSD` notification is issued for each UPS when this primary + `upsmon` instance sets it (and does not return to usual data processing + loop to see and report it like secondaries do). This allows a `NOTIFYCMD` + such as `upssched` on the primary to handle the pending power outage + (e.g. begin stopping heavy services) even while `upsmon` waits for the + secondaries to complete their shutdowns and log out of the `upsd` data + server. [issue #3003, PR #3110] ++ +NOTE: If using `upssched` and monitoring multiple UPSes, consider setting up +a `START-TIMER-SHARED` rule with a short (approx. 1 second) timeout to group +several `FSD` notifications into one executed action. [PR #3097] - `upssched` tool updates: * Previously in PR #2896 (NUT releases v2.8.3 and v2.8.4) the `UPSNAME` and @@ -120,6 +131,9 @@ https://github.com/networkupstools/nut/milestone/12 * Introduced `upssched -l` mode to list currently tracked timers. [#3097] * Make use of `setproctag()` and `getproctag()` to report parent/child process names. [#3084] + * Introduced optional passing of `NOTIFYMSG` text (normally originating + from `upsmon` which calls `upssched`) as an environment variable into + the ultimately executed `CMDSCRIPT` processes. 
[#3105] - `configure` script options: * Introduced `--with-python{,2,3}-modules-dir` to specify PyNUT(Client) diff --git a/clients/upsmon.c b/clients/upsmon.c index 8a862f3307..0d4f5d3c94 100644 --- a/clients/upsmon.c +++ b/clients/upsmon.c @@ -280,6 +280,7 @@ static unsigned __stdcall async_notify(LPVOID param) if (notifycmd != NULL) { snprintf(exec, sizeof(exec), "%s \"%s\"", notifycmd, data->notice); + upsdebugx(6, "%s: Calling NOTIFYCMD: %s", __func__, exec); if (data->upsname) setenv("UPSNAME", data->upsname, 1); else @@ -1132,8 +1133,20 @@ static void setfsd(utype_t *ups) return; } - if (!strncmp(buf, "OK", 2)) + if (!strncmp(buf, "OK", 2)) { + upsdebugx(1, "%s: data server confirmed setting FSD for UPS [%s]", __func__, ups->sys); + + /* Let NOTIFYCMD (if any) know, and have a chance to react */ + if (ups->lastfsdnotify) { + /* e.g. upsd was still alive with a latched FSD + * status when this upsmon instance started */ + upsdebugx(2, "%s: not notifying about FSD for UPS [%s] because it was recently reported already", __func__, ups->sys); + } else { + time(&(ups->lastfsdnotify)); + do_notify(ups, NOTIFY_FSD, NULL); + } return; + } /* protocol error: upsd said something other than "OK" */ upslogx(LOG_ERR, "FSD set on UPS %s failed: %s", ups->sys, buf); @@ -1842,9 +1855,12 @@ static void ups_fsd(utype_t *ups) upsdebugx(3, "%s: %s (first time)", __func__, ups->sys); - /* must have changed from !FSD to FSD, so notify */ + /* must have changed from !FSD to FSD, so notify; avoid duplicates though */ - do_notify(ups, NOTIFY_FSD, NULL); + if (!(ups->lastfsdnotify)) { + time(&(ups->lastfsdnotify)); + do_notify(ups, NOTIFY_FSD, NULL); + } setflag(&ups->status, ST_FSD); } @@ -2112,6 +2128,8 @@ static void addups(int reloading, const char *sys, const char *pvs, tmp->lastrbwarn = 0; tmp->lastncwarn = 0; + tmp->lastfsdnotify = 0; + tmp->offsince = 0; tmp->oblbsince = 0; tmp->oversince = 0; @@ -2867,8 +2885,10 @@ static void parse_status(utype_t *ups, char *status, char 
*buzzword, char *buzzw /* clear these out early if they disappear */ if (!strstr(status, "LB")) clearflag(&ups->status, ST_LOWBATT); - if (!strstr(status, "FSD")) + if (!strstr(status, "FSD")) { clearflag(&ups->status, ST_FSD); + ups->lastfsdnotify = 0; + } /* similar to above - clear these flags and send notifications */ if (!strstr(status, "CAL")) diff --git a/clients/upsmon.h b/clients/upsmon.h index 118f554a63..4007cb1b40 100644 --- a/clients/upsmon.h +++ b/clients/upsmon.h @@ -92,10 +92,12 @@ typedef struct { int pollfail_log_throttle_count; /* How many pollfreq loops this UPS was in this state since last logged report? */ time_t lastpoll; /* time of last successful poll */ - time_t lastnoncrit; /* time of last non-crit poll */ + time_t lastnoncrit; /* time of last non-crit poll */ time_t lastrbwarn; /* time of last REPLBATT warning*/ time_t lastncwarn; /* time of last NOCOMM warning */ + time_t lastfsdnotify; /* time of last FSD notification (when first discovering the state, or setting it - avoid duplicate notification); 0 initially or if that state clears */ + time_t offsince; /* time of recent entry into OFF state */ time_t oblbsince; /* time of recent entry into OB LB state (normally this causes immediate shutdown alert, unless we are configured to delay it) */ time_t oversince; /* time of recent entry into OVER state */ diff --git a/clients/upssched-cmd b/clients/upssched-cmd index 160f50584c..59ad5236d3 100755 --- a/clients/upssched-cmd +++ b/clients/upssched-cmd @@ -17,8 +17,8 @@ echo "`date -u`: $0: THIS IS A SAMPLE SCRIPT, PLEASE TAILOR IT FOR YOUR DEPLOYMENT OF NUT!" >&2 logger -t upssched-cmd "THIS IS A SAMPLE SCRIPT, PLEASE TAILOR IT FOR YOUR DEPLOYMENT OF NUT!" 
-printf "`date -u`: UPSNAME='%s'\tNOTIFYTYPE='%s'\targs=%s\n" "$UPSNAME" "$NOTIFYTYPE" "$@" >&2 -printf "UPSNAME='%s' NOTIFYTYPE='%s' args=%s\n" "$UPSNAME" "$NOTIFYTYPE" "$@" | logger -t upssched-cmd-received-NOTIFYTYPE +printf "`date -u`: UPSNAME='%s'\tNOTIFYTYPE='%s'\tNOTIFYMSG='%s'\targs=%s\n" "$UPSNAME" "$NOTIFYTYPE" "$NOTIFYMSG" "$*" >&2 +printf "UPSNAME='%s' NOTIFYTYPE='%s' NOTIFYMSG='%s' args=%s\n" "$UPSNAME" "$NOTIFYTYPE" "$NOTIFYMSG" "$*" | logger -t upssched-cmd-received-NOTIFYTYPE #set diff --git a/clients/upssched.c b/clients/upssched.c index d458315e22..fe8f914a12 100644 --- a/clients/upssched.c +++ b/clients/upssched.c @@ -28,6 +28,8 @@ /* design notes for the curious: * * 1. we get called with a ups_name and notify_type from upsmon + * (and notify_msg via first non-option argv[] element if + * present and not trivial) * 2. the config file is searched for an AT condition that matches * 3. the conditions on any matching lines are parsed * @@ -73,6 +75,7 @@ typedef struct ttype_s { time_t etime; char **upsnames; /* List of unique UPSNAME values that commanded to start this timer name */ char **notifytypes; /* List of unique NOTIFYTYPE values that commanded to start this timer name */ + char **notifymsgs; /* List of unique NOTIFYMSG values that commanded to start this timer name */ struct ttype_s *next; } ttype_t; @@ -83,7 +86,7 @@ static int nut_debug_level_args = 0, nut_debug_level_env = 0, nut_debug_level_co static int list_timers = 0; /* ups name and notify type (string) as received from upsmon */ -static const char *ups_name, *notify_type, *prog = NULL; +static const char *ups_name = NULL, *notify_type = NULL, *notify_msg = NULL, *prog = NULL; #ifdef WIN32 static OVERLAPPED connect_overlapped; @@ -221,8 +224,8 @@ static char* collect_string(char **string_arr, char *logtag, char *sep, size_t * static void exec_cmd_timer(ttype_t *item) { - char *upsnames = NULL, *notifytypes = NULL; - size_t upsnames_count = 0, notifytypes_count = 0; + char *upsnames 
= NULL, *notifytypes = NULL, *notifymsgs = NULL; + size_t upsnames_count = 0, notifytypes_count = 0, notifymsgs_count = 0; if (!item || !item->name || !(*(item->name))) { upsdebugx(1, "%s: SKIP bad call with null arg or its command name", __func__); @@ -238,15 +241,22 @@ static void exec_cmd_timer(ttype_t *item) notifytypes = collect_string(item->notifytypes, "NOTIFYTYPE", ",", NULL, ¬ifytypes_count); } + if (item->notifymsgs && *(item->notifymsgs) && **(item->notifymsgs)) { + notifymsgs = collect_string(item->notifymsgs, "NOTIFYMSG", ".\t", NULL, ¬ifymsgs_count); + } + if (upsnames) setenv("UPSNAME", upsnames, 1); if (notifytypes) setenv("NOTIFYTYPE", notifytypes, 1); + if (notifymsgs) + setenv("NOTIFYMSG", notifymsgs, 1); + if (nut_debug_level) - upslogx(LOG_INFO, "Executing command by timer: %s\t[%s]\t[%s]", - item->name, NUT_STRARG(notifytypes), NUT_STRARG(upsnames)); + upslogx(LOG_INFO, "Executing command by timer: %s\t[%s]\t[%s]\t[%s]", + item->name, NUT_STRARG(notifytypes), NUT_STRARG(upsnames), NUT_STRARG(notifymsgs)); exec_cmd(item->name); upsdebugx(3, "%s: returned from exec_cmd()", __func__); @@ -261,6 +271,11 @@ static void exec_cmd_timer(ttype_t *item) free(notifytypes); } + if (notifymsgs) { + unsetenv("NOTIFYMSG"); + free(notifymsgs); + } + upsdebugx(3, "%s: done", __func__); } @@ -281,7 +296,7 @@ static void removetimer(ttype_t *tfind) if (tmp->upsnames) { char **ps; - for (ps = tmp->upsnames; ps != NULL && *ps != NULL; ps++) { + for (ps = tmp->upsnames; *ps != NULL; ps++) { free(*ps); } free(tmp->upsnames); @@ -289,12 +304,20 @@ static void removetimer(ttype_t *tfind) if (tmp->notifytypes) { char **ps; - for (ps = tmp->notifytypes; ps != NULL && *ps != NULL; ps++) { + for (ps = tmp->notifytypes; *ps != NULL; ps++) { free(*ps); } free(tmp->notifytypes); } + if (tmp->notifymsgs) { + char **ps; + for (ps = tmp->notifymsgs; *ps != NULL; ps++) { + free(*ps); + } + free(tmp->notifymsgs); + } + upsdebugx(3, "%s: forgetting %s", __func__, tmp->name); 
free(tmp->name); free(tmp); @@ -367,7 +390,7 @@ static void checktimers(void) upsdebugx(3, "%s: done", __func__); } -static void start_timer(const char *name, const char *ofsstr, const char *notifytype, const char *upsname, int shared_timer) +static void start_timer(const char *name, const char *ofsstr, const char *notifytype, const char *upsname, const char *notifymsg, int shared_timer) { time_t now; long ofs; @@ -393,8 +416,8 @@ static void start_timer(const char *name, const char *ofsstr, const char *notify while (tmp) { if (tmp->name && !strcmp(tmp->name, name)) { if (nut_debug_level) - upslogx(LOG_INFO, "Append data to shared timer: %s\t[%s]\t[%s]\t(will elapse in %g seconds)", - name, NUT_STRARG(notifytype), NUT_STRARG(upsname), + upslogx(LOG_INFO, "Append data to shared timer: %s\t[%s]\t[%s]\t[%s]\t(will elapse in %g seconds)", + name, NUT_STRARG(notifytype), NUT_STRARG(upsname), NUT_STRARG(notifymsg), difftime(tmp->etime, now)); /* FIXME? Consider only the first hit as the shared timer? 
@@ -405,13 +428,13 @@ static void start_timer(const char *name, const char *ofsstr, const char *notify char **ps = NULL; size_t count = 0; /* amount of non-NULL entries, if we get to the end */ - for (ps = tmp->notifytypes; ps != NULL && *ps != NULL ; ps++) { + for (ps = tmp->notifytypes; *ps != NULL ; ps++) { count++; if (!strcmp(*ps, notifytype)) break; } - if (ps == NULL || *ps == NULL) { + if (*ps == NULL) { tmp->notifytypes = xrealloc(tmp->notifytypes, count + 2); tmp->notifytypes[count] = xstrdup(notifytype); tmp->notifytypes[count + 1] = NULL; @@ -423,18 +446,41 @@ static void start_timer(const char *name, const char *ofsstr, const char *notify } } + if (notifymsg && *notifymsg) { + if (tmp->notifymsgs) { + char **ps = NULL; + size_t count = 0; /* amount of non-NULL entries, if we get to the end */ + + for (ps = tmp->notifymsgs; *ps != NULL ; ps++) { + count++; + if (!strcmp(*ps, notifymsg)) + break; + } + + if (*ps == NULL) { + tmp->notifymsgs = xrealloc(tmp->notifymsgs, count + 2); + tmp->notifymsgs[count] = xstrdup(notifymsg); + tmp->notifymsgs[count + 1] = NULL; + } + } else { + tmp->notifymsgs = xcalloc(2, sizeof(char*)); + tmp->notifymsgs[0] = xstrdup(notifymsg); + tmp->notifymsgs[1] = NULL; + } + } + if (upsname && *upsname) { if (tmp->upsnames) { char **ps = NULL; size_t count = 0; /* amount of non-NULL entries, if we get to the end */ - for (ps = tmp->upsnames; ps != NULL && *ps != NULL ; ps++) { + for (ps = tmp->upsnames; *ps != NULL ; ps++) { count++; if (!strcmp(*ps, upsname)) break; } - if (ps == NULL || *ps == NULL) { + if (*ps == NULL) { tmp->upsnames = xrealloc(tmp->upsnames, count + 2); tmp->upsnames[count] = xstrdup(upsname); tmp->upsnames[count + 1] = NULL; @@ -456,8 +502,8 @@ static void start_timer(const char *name, const char *ofsstr, const char *notify } if (nut_debug_level) - upslogx(LOG_INFO, "New timer: %s\t[%s]\t[%s]\t(will elapse in %ld seconds)", - name, NUT_STRARG(notifytype), NUT_STRARG(upsname), ofs); + upslogx(LOG_INFO, "New 
timer: %s\t[%s]\t[%s]\t[%s]\t(will elapse in %ld seconds)", + name, NUT_STRARG(notifytype), NUT_STRARG(upsname), NUT_STRARG(notifymsg), ofs); /* now add to the queue */ if (!shared_timer) { @@ -473,6 +519,7 @@ static void start_timer(const char *name, const char *ofsstr, const char *notify tmp->name = xstrdup(name); tmp->etime = now + ofs; tmp->notifytypes = NULL; + tmp->notifymsgs = NULL; tmp->upsnames = NULL; tmp->next = NULL; @@ -482,6 +529,12 @@ static void start_timer(const char *name, const char *ofsstr, const char *notify tmp->notifytypes[1] = NULL; } + if (notifymsg && *notifymsg) { + tmp->notifymsgs = xcalloc(2, sizeof(char*)); + tmp->notifymsgs[0] = xstrdup(notifymsg); + tmp->notifymsgs[1] = NULL; + } + if (upsname && *upsname) { tmp->upsnames = xcalloc(2, sizeof(char*)); tmp->upsnames[0] = xstrdup(upsname); @@ -494,7 +547,7 @@ static void start_timer(const char *name, const char *ofsstr, const char *notify thead = tmp; } -static void cancel_timer(const char *name, const char *cname, const char *notifytype, const char *upsname, int do_cancel_matched) +static void cancel_timer(const char *name, const char *cname, const char *notifytype, const char *upsname, const char *notifymsg, int do_cancel_matched) { ttype_t *tmp; size_t removed = 0; @@ -505,6 +558,7 @@ static void cancel_timer(const char *name, const char *cname, const char *notify for (tmp = thead; tmp != NULL; tmp = tmp->next) { if (!strcmp(tmp->name, name)) { /* match */ + /* Note we do not match "notifymsg" as it likely differs */ if (!do_cancel_matched || ( (!notifytype || !(*notifytype)) && (!upsname || !(*upsname)) ) @@ -522,7 +576,7 @@ static void cancel_timer(const char *name, const char *cname, const char *notify upsdebugx(2, "%s: do not cancel timer %s due to lack of NOTIFYTYPE in it", __func__, name); continue; } - for (ps = tmp->notifytypes; ps != NULL && *ps != NULL ; ps++) { + for (ps = tmp->notifytypes; *ps != NULL ; ps++) { if (!strcmp(*ps, notifytype)) { matched = 1; break; @@ -540,7 
+594,7 @@ static void cancel_timer(const char *name, const char *cname, const char *notify upsdebugx(2, "%s: do not cancel timer %s due to lack of UPSNAME in it", __func__, name); continue; } - for (ps = tmp->upsnames; ps != NULL && *ps != NULL ; ps++) { + for (ps = tmp->upsnames; *ps != NULL ; ps++) { if (!strcmp(*ps, upsname)) { matched = 1; break; @@ -553,8 +607,13 @@ static void cancel_timer(const char *name, const char *cname, const char *notify } } - if (nut_debug_level) - upslogx(LOG_INFO, "Cancelling timer: %s", name); + if (nut_debug_level) { + if (notifymsg && *notifymsg) { + upslogx(LOG_INFO, "Cancelling timer: %s: %s", name, notifymsg); + } else { + upslogx(LOG_INFO, "Cancelling timer: %s", name); + } + } removetimer(tmp); removed++; @@ -969,7 +1028,7 @@ static int sock_arg(conn_t *conn) /* LIST-TIMERS (no args expected now) * returns a list with tab-separated values for: - * NAME TO_ABS TO_REL NOTIFYTYPES UPSNAMES + * NAME TO_ABS TO_REL NOTIFYTYPES UPSNAMES NOTIFYMSGS_TABSEP */ if (!strcmp(conn->ctx.arglist[0], "LIST-TIMERS")) { ttype_t *item = thead; @@ -1003,6 +1062,20 @@ static int sock_arg(conn_t *conn) s = collect_string(item->upsnames, "UPSNAME", ",", NULL, NULL); } + if (s && *s) { + send_to_one(conn, "%s\t", s); + } else { + send_to_one(conn, "\"\"\t"); + } + if (s) { + free(s); + } + + s = NULL; + if (item->notifymsgs && *(item->notifymsgs) && **(item->notifymsgs)) { + s = collect_string(item->notifymsgs, "NOTIFYMSG", ".\t", NULL, NULL); + } + if (s && *s) { send_to_one(conn, "%s\n", s); } else { @@ -1020,7 +1093,7 @@ static int sock_arg(conn_t *conn) return 1; } - /* CANCEL [] [ [] [ ] */ { /* scoping */ int do_cancel = !strcmp(conn->ctx.arglist[0], "CANCEL"), do_cancel_matched = !strcmp(conn->ctx.arglist[0], "CANCEL-MATCHED"); @@ -1028,34 +1101,39 @@ static int sock_arg(conn_t *conn) if (do_cancel || do_cancel_matched) { /* "cmd" may be present and empty, this is handled in the method */ if (conn->ctx.numargs < 3) - 
cancel_timer(conn->ctx.arglist[1], NULL, NULL, NULL, do_cancel_matched); + cancel_timer(conn->ctx.arglist[1], NULL, + NULL, NULL, NULL, do_cancel_matched); else - if (conn->ctx.numargs < 5) - cancel_timer(conn->ctx.arglist[1], conn->ctx.arglist[2], NULL, NULL, do_cancel_matched); + if (conn->ctx.numargs < 6) + cancel_timer(conn->ctx.arglist[1], conn->ctx.arglist[2], + NULL, NULL, NULL, do_cancel_matched); else cancel_timer(conn->ctx.arglist[1], conn->ctx.arglist[2], - conn->ctx.arglist[3], conn->ctx.arglist[4], do_cancel_matched); + conn->ctx.arglist[3], conn->ctx.arglist[4], + conn->ctx.arglist[5], do_cancel_matched); send_to_one(conn, "OK\n"); return 1; } } - if (conn->ctx.numargs < 5) + if (conn->ctx.numargs < 6) return 0; - /* START */ + /* START */ if (!strcmp(conn->ctx.arglist[0], "START")) { start_timer(conn->ctx.arglist[1], conn->ctx.arglist[2], - conn->ctx.arglist[3], conn->ctx.arglist[4], 0); + conn->ctx.arglist[3], conn->ctx.arglist[4], + conn->ctx.arglist[5], 0); send_to_one(conn, "OK\n"); return 1; } - /* START-SHARED */ + /* START-SHARED */ if (!strcmp(conn->ctx.arglist[0], "START-SHARED")) { start_timer(conn->ctx.arglist[1], conn->ctx.arglist[2], - conn->ctx.arglist[3], conn->ctx.arglist[4], 1); + conn->ctx.arglist[3], conn->ctx.arglist[4], + conn->ctx.arglist[5], 1); send_to_one(conn, "OK\n"); return 1; } @@ -1293,6 +1371,7 @@ static void start_daemon(TYPE_FD lockfd) * CMDSCRIPT to run */ unsetenv("NOTIFYTYPE"); unsetenv("UPSNAME"); + unsetenv("NOTIFYMSG"); /* now watch for activity */ upsdebugx(2, "Timer daemon waiting for connections on pipefd %d", @@ -1404,6 +1483,7 @@ static void start_daemon(TYPE_FD lockfd) * CMDSCRIPT to run */ unsetenv("NOTIFYTYPE"); unsetenv("UPSNAME"); + unsetenv("NOTIFYMSG"); /* now watch for activity */ @@ -1587,7 +1667,7 @@ static void sendcmd(const char *cmd, const char *arg1, const char *arg2) int i; ssize_t ret; size_t enclen, buflen; - char buf[SMALLBUF], enc[SMALLBUF + 8]; + char buf[LARGEBUF], enc[LARGEBUF + 8]; 
#ifndef WIN32 int ret_s; struct timeval tv; @@ -1621,6 +1701,9 @@ static void sendcmd(const char *cmd, const char *arg1, const char *arg2) snprintfcat(buf, sizeof(buf), " \"%s\"", ups_name? pconf_encode(ups_name, enc, sizeof(enc)) : ""); + snprintfcat(buf, sizeof(buf), " \"%s\"", + notify_msg ? pconf_encode(notify_msg, enc, sizeof(enc)) : ""); + snprintf(enc, sizeof(enc), "%s\n", buf); /* Sanity checks, for static analyzers to sleep well */ @@ -1895,33 +1978,37 @@ static void parse_at(const char *ntype, const char *un, const char *cmd, /* if command is valid, send it to the daemon (which may start it) */ if (!strcmp(cmd, "START-TIMER")) { - upsdebugx(1, "%s: processing %s\t[%s]\t[%s]\t[%s]\t[%s]", __func__, cmd, + upsdebugx(1, "%s: processing %s\t[%s]\t[%s]\t[%s]\t[%s]\t[%s]", __func__, cmd, NUT_STRARG(ca1), NUT_STRARG(ca2), - NUT_STRARG(notify_type), NUT_STRARG(ups_name)); + NUT_STRARG(notify_type), NUT_STRARG(ups_name), + NUT_STRARG(notify_msg)); sendcmd("START", ca1, ca2); return; } if (!strcmp(cmd, "START-TIMER-SHARED")) { - upsdebugx(1, "%s: processing %s\t[%s]\t[%s]\t[%s]\t[%s]", __func__, cmd, + upsdebugx(1, "%s: processing %s\t[%s]\t[%s]\t[%s]\t[%s]\t[%s]", __func__, cmd, NUT_STRARG(ca1), NUT_STRARG(ca2), - NUT_STRARG(notify_type), NUT_STRARG(ups_name)); + NUT_STRARG(notify_type), NUT_STRARG(ups_name), + NUT_STRARG(notify_msg)); sendcmd("START-SHARED", ca1, ca2); return; } if (!strcmp(cmd, "CANCEL-TIMER")) { - upsdebugx(1, "%s: processing %s\t[%s]\t[%s]\t[%s]\t[%s]", __func__, cmd, + upsdebugx(1, "%s: processing %s\t[%s]\t[%s]\t[%s]\t[%s]\t[%s]", __func__, cmd, NUT_STRARG(ca1), NUT_STRARG(ca2), - NUT_STRARG(notify_type), NUT_STRARG(ups_name)); + NUT_STRARG(notify_type), NUT_STRARG(ups_name), + NUT_STRARG(notify_msg)); sendcmd("CANCEL", ca1, ca2); return; } if (!strcmp(cmd, "EXECUTE")) { - upsdebugx(1, "%s: processing %s\t[%s]\t[%s]\t[%s]\t[%s]", __func__, cmd, + upsdebugx(1, "%s: processing %s\t[%s]\t[%s]\t[%s]\t[%s]\t[%s]", __func__, cmd, NUT_STRARG(ca1), 
NUT_STRARG(ca2), - NUT_STRARG(notify_type), NUT_STRARG(ups_name)); + NUT_STRARG(notify_type), NUT_STRARG(ups_name), + NUT_STRARG(notify_msg)); if (ca1[0] == '\0') { upslogx(LOG_ERR, "Empty EXECUTE command argument"); @@ -1935,9 +2022,10 @@ static void parse_at(const char *ntype, const char *un, const char *cmd, return; } - upslogx(LOG_ERR, "Invalid command: %s\t[%s]\t[%s]\t[%s]\t[%s]", cmd, + upslogx(LOG_ERR, "Invalid command: %s\t[%s]\t[%s]\t[%s]\t[%s]\t[%s]", cmd, NUT_STRARG(ca1), NUT_STRARG(ca2), - NUT_STRARG(notify_type), NUT_STRARG(ups_name)); + NUT_STRARG(notify_type), NUT_STRARG(ups_name), + NUT_STRARG(notify_msg)); } static int conf_arg(size_t numargs, char **arg) @@ -2085,7 +2173,7 @@ static void help(const char *arg_progname) printf("upssched: upsmon's scheduling helper for offset timers\n"); printf("Practical behavior is managed by UPSNAME and NOTIFYTYPE envvars\n"); - printf("\nUsage: %s [OPTIONS]\n\n", arg_progname); + printf("\nUsage: %s [OPTIONS] [NOTIFYMSG]\n\n", arg_progname); printf(" -D raise debugging level (NOTE: keeps reporting when daemonized)\n"); printf(" -V display the version of this software\n"); printf(" -h display this help\n"); @@ -2100,7 +2188,7 @@ static void help(const char *arg_progname) int main(int argc, char **argv) { - int i; + int i, argn = 0; if (argc > 0) prog = xbasename(argv[0]); @@ -2108,6 +2196,7 @@ int main(int argc, char **argv) prog = "upssched"; while ((i = getopt(argc, argv, "+DVhl")) != -1) { + argn++; switch (i) { case 'D': nut_debug_level_args++; @@ -2153,6 +2242,9 @@ int main(int argc, char **argv) ups_name = getenv("UPSNAME"); notify_type = getenv("NOTIFYTYPE"); + upsdebugx(2, "Remaining argn=%d of argc=%d", argn, argc); + if (argc > argn + 1 && *argv[argn + 1]) + notify_msg = argv[argn + 1]; if ((!list_timers) && ((!ups_name) || (!notify_type))) { printf("Error: environment variables UPSNAME and NOTIFYTYPE must be set.\n"); @@ -2160,6 +2252,12 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } + 
upsdebugx(1, "Handling NOTIFYTYPE='%s' for UPSNAME='%s'", notify_type, ups_name); + if (notify_msg) + upsdebugx(1, "Got a NOTIFYMSG from command line: %s", notify_msg); + else + upsdebugx(1, "Did not get any NOTIFYMSG from command line"); + /* see if this matches anything in the config file */ /* This is actually the processing loop: * checkconf -> conf_arg -> parse_at -> sendcmd -> daemon if needed diff --git a/conf/upsmon.conf.sample.in b/conf/upsmon.conf.sample.in index 0dc09dc48c..1a0be42969 100644 --- a/conf/upsmon.conf.sample.in +++ b/conf/upsmon.conf.sample.in @@ -402,6 +402,7 @@ POWERDOWNFLAG "@POWERDOWNFLAG@" # NOTIFYFLAG COMMOK SYSLOG+WALL # NOTIFYFLAG COMMBAD SYSLOG+WALL # NOTIFYFLAG SHUTDOWN SYSLOG+WALL +# NOTIFYFLAG SHUTDOWN_HOSTSYNC SYSLOG+WALL # NOTIFYFLAG REPLBATT SYSLOG+WALL # NOTIFYFLAG NOCOMM SYSLOG+WALL # NOTIFYFLAG NOPARENT SYSLOG+WALL @@ -413,9 +414,17 @@ POWERDOWNFLAG "@POWERDOWNFLAG@" # NOTIFYFLAG NOTBYPASS SYSLOG+WALL # NOTIFYFLAG ECO SYSLOG+WALL # NOTIFYFLAG NOTECO SYSLOG+WALL +# # NOTIFYFLAG ALARM SYSLOG+WALL # NOTIFYFLAG NOTALARM SYSLOG+WALL # +# NOTIFYFLAG OVER SYSLOG+WALL +# NOTIFYFLAG NOTOVER SYSLOG+WALL +# NOTIFYFLAG TRIM SYSLOG+WALL +# NOTIFYFLAG NOTTRIM SYSLOG+WALL +# NOTIFYFLAG BOOST SYSLOG+WALL +# NOTIFYFLAG NOTBOOST SYSLOG+WALL +# # NOTIFYFLAG OTHER SYSLOG+WALL # NOTIFYFLAG NOTOTHER SYSLOG+WALL # diff --git a/conf/upssched.conf.sample.in b/conf/upssched.conf.sample.in index 1d2961a1ce..bfb9b99f68 100644 --- a/conf/upssched.conf.sample.in +++ b/conf/upssched.conf.sample.in @@ -118,10 +118,10 @@ CMDSCRIPT @BINDIR@/upssched-cmd # Start a timer called that will trigger after # seconds, calling your CMDSCRIPT with as the first # argument. Each invocation checks if the was already -# started, and if so -- appends the current event's `UPSNAME` and -# `NOTIFYTYPE` to the list of unique values it would report via -# environment variables (as a comma-separated string) when the -# timer does execute. 
+# started, and if so -- appends the current event's `UPSNAME`, +# `NOTIFYTYPE` and `NOTIFYMSG` to the list of unique values it would +# report via environment variables (comma-separated names and types, +# tab-separated messages) when the timer does execute. # # NOTE: Currently this updates the first seen instance with the # <timername> (in case you managed to start many). @@ -156,7 +156,8 @@ CMDSCRIPT @BINDIR@/upssched-cmd # - CANCEL-TIMER-MATCHED <timername> [cmd] # # Similar to the above, but tries to only cancel the <timername> if it -# refers to the `UPSNAME` and `NOTIFYTYPE` values passed by caller. +# refers to the `UPSNAME` and `NOTIFYTYPE` values passed by caller (the +# `NOTIFYMSG` is ignored in this context). # # 1) If any UPS (*) reverts to utility power, then stop the timer before it # triggers ONLY if that UPS is associated with the already scheduled timer: diff --git a/docs/man/upsmon.txt b/docs/man/upsmon.txt index 521946b0dd..05c20e29c1 100644 --- a/docs/man/upsmon.txt +++ b/docs/man/upsmon.txt @@ -394,16 +394,16 @@ This design allows you to lose some of your power supplies in a redundant power environment without bringing down the entire system, while still working properly for smaller systems. -UPS TYPES ---------- +UPS CONNECTION TYPES AND UPSMON ROLES +------------------------------------- *upsmon* and linkman:upsd[8] don't always run on the same system. When they -do, any UPSes that are directly attached to the upsmon host should be -monitored in "primary" mode. This makes upsmon take charge of that equipment, -and it will wait for the "secondary" systems to disconnect before shutting -down the local system. This allows the distant systems (monitoring over -the network) to shut down cleanly before `upsdrvctl shutdown` runs locally -and turns them all off. 
+do, any UPSes that are directly attached to that upsmon host should be +monitored in "primary" mode, which makes that upsmon instance take charge +of that equipment, and it will wait for the "secondary" systems to disconnect +before shutting down the local system. This allows the distant systems (just +monitoring over the network) to shut down cleanly before `upsdrvctl shutdown` +runs locally on the primary system and turns them all off. When upsmon runs as a secondary, it is relying on the distant system to tell it about the state of the UPS. When that UPS goes critical (on battery @@ -423,7 +423,11 @@ should break somehow. This defaults to 15 seconds. If your primary system is shutting down too quickly, set the FINALDELAY interval to something greater than the default 15 seconds. Don't set this too high, or your UPS battery may run out of power before the -primary upsmon process shuts down that system. +primary upsmon process shuts down that system. If you do need more time, +consider starting the shutdown after a short time on battery, for details +see the Timed Shutdowns section. + +For a more technical take, please see the Shutdown Activity Workflow section. TIMED SHUTDOWNS --------------- @@ -500,6 +504,8 @@ by starting another copy of the program with `-c fsd` command line argument. This is useful when you want to initiate a shutdown before the critical stage through some external means, such as linkman:upssched[8]. +For a more technical take, please see the Shutdown Activity Workflow section. + WARNING: Please note that by design, since we require power-cycling the load and don't want some systems to be powered off while others remain running if the "wall power" returns at the wrong moment as usual, the "FSD" @@ -542,11 +548,66 @@ crawling under a desk to find the plug. Note you can also use a dummy SHUTDOWNCMD setting to just report that the systems would shut down at this point, without actually disrupting their work. 
+For inspiration, you can see the setup done by the NUT Integration Tests suite +under the `tests/NIT` directory in NUT sources, including references to the +shutdown and notification scripts which only log the activity (you may have +to configure at least a trivial NUT build and run `make check-NIT-sandbox` to +generate some of the configuration files -- or inspect the `nit.sh` script +which pieces them together). Notably, see `scripts/misc/notifyme-debug` as +not only a logger, but optionally a wrapper for `upssched` (in test runs), +and `clients/upssched-cmd` as a sample implementation of an linkman:upssched[8] +`CMDSCRIPT` which also focuses on logging. + WARNING: After such "dummy" experiments you may have to restart the NUT data server `upsd` to clear its "FSD" flag for the devices and clients involved, and make sure no files named by `POWERDOWNFLAG` option (e.g. `/etc/killpower`) remain on the `upsmon primary` systems under test. +SHUTDOWN ACTIVITY WORKFLOW +-------------------------- + +Looking into `clients/upsmon.c` sources as the ultimate authority, you +can find the chain of events during a forced shutdown outlined below. This can +help make sense of the timing variables involved, and notifications sent +(which you may want to handle, perhaps with linkman:upssched[8]): + +* The path to shutdown activity of a host starts when its locally running + `upsmon` client (in any role) decides the power situation is critical, + e.g. 
by having too few "healthy" power supplies in a real outage, and/or + by seeing `FSD` among `ups.status` tokens -- possibly still "latched" in + the `upsd` data server while you start a new `upsmon` instance, or when + you call `upsmon -c fsd` on that system to simulate the outage; +* Such `upsmon` instance ends up in `forceshutdown()` method; +* There it loops over all UPSes it `MONITOR`s as a `primary`, and calls the + `setfsd()` method for each (causing a local `FSD` notification, if one was + not sent earlier); +* If there were no such UPSes -- we are a secondary, and go into `doshutdown()` + method immediately; +* Otherwise we are a primary, and only go into `doshutdown()` method after + first completing the `sync_secondaries()` method, which: + * runs a loop until either there are no other persistent clients + logged on to the data server (`upsd`) for each UPS we are a primary for, + or until `HOSTSYNC` timeout elapses; + * should issue a `SHUTDOWN_HOSTSYNC` notification if it is going to wait + at all (if there were other clients seen on first loop cycle). +* Finally, in the `doshutdown()` method, it: + * issues a `SHUTDOWN` notification; + * waits for `FINALDELAY`; + * starts the timer for `SHUTDOWNEXIT` (which may be used to force `upsmon` + process to linger after calling `SHUTDOWNCMD` -- e.g. can be used by a + secondary to block the primary from cutting power too early for some use + cases, like safely parking some external machinery); + * calls the `SHUTDOWNCMD` (either directly in mono-process mode, or by + telling the `root`-privileged part to do so in the common case); + * optionally lingers, if `SHUTDOWNEXIT` is so configured; + * ultimately exits the daemon (also causes the `root`-privileged part to exit, + by breaking the communications pipe between them). 
+ +Note that if your `upsmon` does split into privileged and unprivileged parts, +all notifications run in the unprivileged context (your handling scripts may +have to `sudo` explicitly if/when/where you want to do something to the system). +Only `SHUTDOWNCMD` is called in privileged context. + DEAD UPSES ---------- diff --git a/docs/man/upssched.conf.txt b/docs/man/upssched.conf.txt index 33d3b2ca57..b0d38f04c8 100644 --- a/docs/man/upssched.conf.txt +++ b/docs/man/upssched.conf.txt @@ -18,6 +18,13 @@ IMPORTANT NOTES * Contents of this file should be pure ASCII (character codes not in range would be ignored with a warning message). +* Command execution is synchronous (with the called tool process in case + of `EXECUTE` directive, or with the timer process). Consider using your + system shell abilities like `&` to send long-duration handling to the + background and let `upssched` timer daemon continue. This should not + impact `upsmon` daemon, which handles each notification in a separate + sub-process (and so not a problem for immediate `EXECUTE` events). + CONFIGURATION DIRECTIVES ------------------------ @@ -98,10 +105,11 @@ gone for 10 seconds Start a timer of 'interval' seconds. When it triggers, it will pass the argument 'timername' as an argument to your CMDSCRIPT. Each invocation checks if the 'timername' was already -started, and if so -- appends the current event's `UPSNAME` and -`NOTIFYTYPE` to the list of unique values it would report via -environment variables (as a comma-separated string) when the -timer does execute. +started, and if so -- appends the current event's `UPSNAME`, +`NOTIFYTYPE` and `NOTIFYMSG` to the list of unique values it +would report via environment variables (as a comma-separated +string for `UPSNAME` and `NOTIFYTYPE`, and tab-separated +sentences for `NOTIFYMSG`) when the timer does execute. + NOTE: Currently this updates the first seen instance with the 'timername' (in case you managed to start many). 
@@ -131,6 +139,7 @@ stop the timer before it triggers *CANCEL-TIMER-MATCHED* 'timername' ['cmd'];; Similar to the above, but tries to only cancel the 'timername' if it refers to the `UPSNAME` and `NOTIFYTYPE` values passed by caller. +The `NOTIFYMSG` is ignored in this context. + Example: + diff --git a/docs/man/upssched.txt b/docs/man/upssched.txt index ef8ebd0bd9..3b790fbb0f 100644 --- a/docs/man/upssched.txt +++ b/docs/man/upssched.txt @@ -9,7 +9,7 @@ upssched - Timer helper for scheduling events from upsmon SYNOPSIS -------- -*upssched* +*upssched* [OPTIONS] [NOTIFYMSG] NOTE: *upssched* should be run from linkman:upsmon[8] via the NOTIFYCMD. You should never run it directly during normal operations. @@ -17,7 +17,8 @@ You should never run it directly during normal operations. *upssched* -l List currently tracked timer events, if any. Report as a TAB-separated -table of: 'NAME', 'TIMEOUT_ABS', 'TIMEOUT_REL', 'NOTIFYTYPE', 'UPSNAME'. +table of: 'NAME', 'TIMEOUT_ABS', 'TIMEOUT_REL', 'NOTIFYTYPE', 'UPSNAME', +'NOTIFYMSG'. DESCRIPTION ----------- @@ -27,6 +28,36 @@ relative to events being monitored by linkman:upsmon[8]. The original purpose was to allow for a shutdown to occur after some fixed period on battery, but there are other uses that are possible. +COMMON OPTIONS +-------------- + +*-h*:: +Show the command-line help message. + +*-V*:: +Show the NUT version banner. More details may be available if you also +`export NUT_DEBUG_LEVEL=1` or a greater verbosity level. + +*-D*:: +Raise the debugging level. Use this option multiple times for more details. + +OPTIONS +------- + +By default `upssched` processes its configuration file and executes or queues +calls to its `CMDSCRIPT`, or cancels some previously queued item(s), based on +configuration and the `NOTIFYTYPE` it receives. One exception to this is the +queue listing mode `-l`. + +*-l*:: +List pending timers (if any) and exit.
+ +*NOTIFYMSG*:: +Optionally pass a text message (typically originating from a linkman:upsmon[8] +call to `upssched` as its `NOTIFYCMD`) as an environment variable named +`NOTIFYMSG` to the `CMDSCRIPT` launched by `upssched` immediately or after +a timer expires. + INTEGRATION ----------- @@ -50,6 +81,14 @@ If you also want to continue writing to the syslog, just add it in: For a full list of notify flags, see the linkman:upsmon[8] documentation. +Please note that command execution is synchronous (with the called `upssched` +tool process in case of `EXECUTE` directive, or with the timer process). +Consider using your system shell abilities like `&` to send long-duration +handling to the background and let the `upssched` timer daemon continue. +This should not impact the `upsmon` daemon, which handles each notification +in a separate sub-process (and so this is not a problem for immediate `EXECUTE` +events). + CONFIGURATION ------------- @@ -65,6 +104,24 @@ to shut down the slaves in a controlled manner. Be sure you cancel the timer if power returns (ONLINE). +EARLY PREPARATION FOR A SHUTDOWN ON UPSMON PRIMARY INSTANCE +----------------------------------------------------------- + +The linkman:upsmon[8] primary instance is responsible for telling the UPS(es) +to power off at the end of an emergency shutdown. As such, if there are several +clients, the primary instance raises an "FSD" (Forced Shut Down) flag on the +data server for each UPS it manages, and waits for secondary instances to log +off (or for a timeout to expire). If there are activities that should happen +on the primary upsmon's computer during shutdown which take a long time, you +can use the `FSD` notification to begin those operations while the primary +`upsmon` instance waits for the secondaries to complete their shutdowns. + +If you have several UPSes, you may want to combine several notifications with +the `START-TIMER-SHARED` directive (with a short timeout), so you only react +once.
Alternately, if the needed activity varies by the UPS (e.g. custom +remote-device shutdown scripts), you may actually want to use `EXECUTE` rules +right away (and dispatch further work in your `CMDSCRIPT`). + DEBOUNCING EVENTS ----------------- diff --git a/docs/nut.dict b/docs/nut.dict index aba9ef023f..af914be15d 100644 --- a/docs/nut.dict +++ b/docs/nut.dict @@ -1,4 +1,4 @@ -personal_ws-1.1 en 3555 utf-8 +personal_ws-1.1 en 3557 utf-8 AAC AAS ABI @@ -1961,6 +1961,7 @@ docinfo docs dod domxml +doshutdown dotnet downloadable dpkg @@ -3043,6 +3044,7 @@ servicebypass setFeature setaux setflags +setfsd setgid setinfo setpci diff --git a/scripts/misc/notifyme-debug b/scripts/misc/notifyme-debug index ea1ad37b09..550e62e4f2 100755 --- a/scripts/misc/notifyme-debug +++ b/scripts/misc/notifyme-debug @@ -24,5 +24,5 @@ if [ -n "${TOP_BUILDDIR}" -a -x "${TOP_BUILDDIR}/clients/upssched" ] ; then if [ "${NUT_DEBUG_LEVEL-}" -gt 0 ] 2>/dev/null ; then printf '%s: %s\t%s\t[%s]:\targs: %s\t(%s arg tokens)\n' "`date -u`" "$0" "${NOTIFYTYPE-}" "${UPSNAME-}" "$*" "$#" >&2 fi - "${TOP_BUILDDIR}/clients/upssched" + "${TOP_BUILDDIR}/clients/upssched" "$@" fi diff --git a/tests/NIT/upssched.conf.in b/tests/NIT/upssched.conf.in index abf5129f48..2a6d630c09 100644 --- a/tests/NIT/upssched.conf.in +++ b/tests/NIT/upssched.conf.in @@ -15,7 +15,7 @@ LOCKFN @NUT_STATEPATH@/upssched.lock # ============================================================================ # info2client -AT ONLINE * EXECUTE ONLINE +AT ONLINE * EXECUTE ONLINE-HANDLER # info2admin only AT ONLINE * CANCEL-TIMER BATT-STATUS-1s AT ONLINE * CANCEL-TIMER BATT-STATUS-2s @@ -41,7 +41,7 @@ AT ONLINE * START-TIMER LINE-STATUS-30 1800 AT ONLINE * START-TIMER LINE-STATUS-60 3600 # # info2client -AT ONBATT * EXECUTE ONBATT +AT ONBATT * EXECUTE ONBATT-HANDLER # info2admin only AT ONBATT * CANCEL-TIMER LINE-STATUS-1s AT ONBATT * CANCEL-TIMER LINE-STATUS-2s @@ -70,10 +70,11 @@ AT ONBATT * START-TIMER BATT-STATUS-60 3600 
######################### # info2client -AT REPLBATT * ONBATT * EXECUTE REPLBATT -AT NOCOMM * EXECUTE NOCOMM -AT FSD * EXECUTE FSD -AT SHUTDOWN * EXECUTE SHUTDOWN +AT REPLBATT * ONBATT * EXECUTE REPLBATT-HANDLER +AT NOCOMM * EXECUTE NOCOMM-HANDLER +AT FSD * EXECUTE FSD-HANDLER +AT SHUTDOWN * EXECUTE SHUTDOWN-HANDLER +AT SHUTDOWN_HOSTSYNC * EXECUTE SHUTDOWN_HOSTSYNC-HANDLER # info2admin only AT LOWBATT * EXECUTE LOWBATT-INFO