Skip to content

Commit 415a87d

Browse files
committed
Merge branch 'master' of github.com:ClusterLabs/pacemaker
2 parents 8abdd82 + ce2a422 commit 415a87d

33 files changed

+664
-318
lines changed

crmd/control.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -1046,14 +1046,14 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
10461046

10471047
value = crmd_pref(config_hash, "load-threshold");
10481048
if(value) {
1049-
throttle_load_target = strtof(value, NULL) / 100;
1049+
throttle_set_load_target(strtof(value, NULL) / 100.0);
10501050
}
10511051

10521052
value = crmd_pref(config_hash, "no-quorum-policy");
10531053
if (safe_str_eq(value, "suicide") && pcmk_locate_sbd()) {
10541054
no_quorum_suicide_escalation = TRUE;
10551055
}
1056-
1056+
10571057
value = crmd_pref(config_hash,"stonith-max-attempts");
10581058
update_stonith_max_attempts(value);
10591059

crmd/crmd_utils.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
# define CRMD_UTILS__H
2020

2121
# include <crm/crm.h>
22+
# include <crm/transition.h>
2223
# include <crm/common/xml.h>
2324
# include <crm/cib/internal.h> /* For CIB_OP_MODIFY */
2425
# include "crmd_alerts.h"
@@ -100,8 +101,10 @@ int crmd_join_phase_count(enum crm_join_phase phase);
100101
void crmd_join_phase_log(int level);
101102

102103
const char *get_timer_desc(fsa_timer_t * timer);
103-
gboolean too_many_st_failures(void);
104104
void st_fail_count_reset(const char * target);
105+
void st_fail_count_increment(const char *target);
106+
void abort_for_stonith_failure(enum transition_action abort_action,
107+
const char *target, xmlNode *reason);
105108
void crmd_peer_down(crm_node_t *peer, bool full);
106109

107110
/* Convenience macro for registering a CIB callback

crmd/election.c

-3
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,6 @@
3131
#include <crmd_callbacks.h>
3232
#include <tengine.h>
3333

34-
#define STORM_INTERVAL 2 /* in seconds */
35-
#define STORM_MULTIPLIER 5 /* multiplied by the number of nodes */
36-
3734
/* A_ELECTION_VOTE */
3835
void
3936
do_election_vote(long long action,

crmd/join_client.c

+24
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,26 @@ void join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, v
3030

3131
extern ha_msg_input_t *copy_ha_msg_input(ha_msg_input_t * orig);
3232

33+
/*!
34+
* \internal
35+
* \brief Remember if DC is shutting down as we join
36+
*
37+
* If we're joining while the current DC is shutting down, update its expected
38+
* state, so we don't fence it if we become the new DC. (We weren't a peer
39+
* when it broadcast its shutdown request.)
40+
*
41+
* \param[in] msg A join message from the DC; only its F_CRM_DC_LEAVING
*                 field is read here
42+
*/
43+
static void
44+
update_dc_expected(xmlNode *msg)
45+
{
46+
if (fsa_our_dc && crm_is_true(crm_element_value(msg, F_CRM_DC_LEAVING))) { /* a DC is known and the message marks it as leaving */
47+
crm_node_t *dc_node = crm_get_peer(0, fsa_our_dc); /* peer entry for the current DC */
48+
49+
crm_update_peer_expected(__FUNCTION__, dc_node, CRMD_JOINSTATE_DOWN); /* set the DC's expected state to CRMD_JOINSTATE_DOWN */
50+
}
51+
}
52+
3353
/* A_CL_JOIN_QUERY */
3454
/* is there a DC out there? */
3555
void
@@ -128,6 +148,8 @@ do_cl_join_offer_respond(long long action,
128148
return;
129149
}
130150

151+
update_dc_expected(input->msg);
152+
131153
CRM_LOG_ASSERT(input != NULL);
132154
query_call_id =
133155
fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local | cib_no_children);
@@ -250,6 +272,8 @@ do_cl_join_finalize_respond(long long action,
250272
return;
251273
}
252274

275+
update_dc_expected(input->msg);
276+
253277
/* send our status section to the DC */
254278
tmp1 = do_lrm_query(TRUE, fsa_our_uname);
255279
if (tmp1 != NULL) {

crmd/join_dc.c

+30-8
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,30 @@ initialize_join(gboolean before)
106106
}
107107
}
108108

109+
/*!
110+
* \internal
111+
* \brief Create a join message from the DC
112+
*
113+
* \param[in] join_op Join operation name
114+
* \param[in] host_to Recipient of message
*
* \return XML produced by create_request(), with F_CRM_JOIN_ID and
*         F_CRM_DC_LEAVING added (presumably the caller is responsible for
*         freeing it -- confirm against create_request())
115+
*/
116+
static xmlNode *
117+
create_dc_message(const char *join_op, const char *host_to)
118+
{
119+
xmlNode *msg = create_request(join_op, NULL, host_to, CRM_SYSTEM_CRMD,
120+
CRM_SYSTEM_DC, NULL);
121+
122+
/* Identify which election this is a part of */
123+
crm_xml_add_int(msg, F_CRM_JOIN_ID, current_join_id);
124+
125+
/* Add a field specifying whether the DC is shutting down. This keeps the
126+
* joining node from fencing the old DC if it becomes the new DC.
* The value is taken from the R_SHUTDOWN bit of fsa_input_register.
127+
*/
128+
crm_xml_add_boolean(msg, F_CRM_DC_LEAVING,
129+
is_set(fsa_input_register, R_SHUTDOWN));
130+
return msg;
131+
}
132+
109133
static void
110134
join_make_offer(gpointer key, gpointer value, gpointer user_data)
111135
{
@@ -147,10 +171,8 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data)
147171

148172
crm_update_peer_join(__FUNCTION__, (crm_node_t*)member, crm_join_none);
149173

150-
offer = create_request(CRM_OP_JOIN_OFFER, NULL, member->uname,
151-
CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);
174+
offer = create_dc_message(CRM_OP_JOIN_OFFER, member->uname);
152175

153-
crm_xml_add_int(offer, F_CRM_JOIN_ID, current_join_id);
154176
/* send the welcome */
155177
crm_info("join-%d: Sending offer to %s", current_join_id, member->uname);
156178

@@ -242,8 +264,10 @@ do_dc_join_offer_one(long long action,
242264
/* always offer to the DC (ourselves)
243265
* this ensures the correct value for max_generation_from
244266
*/
245-
member = crm_get_peer(0, fsa_our_uname);
246-
join_make_offer(NULL, member, NULL);
267+
if (strcmp(join_to, fsa_our_uname) != 0) {
268+
member = crm_get_peer(0, fsa_our_uname);
269+
join_make_offer(NULL, member, NULL);
270+
}
247271

248272
/* this was a genuine join request, cancel any existing
249273
* transition and invoke the PE
@@ -586,9 +610,7 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data)
586610
}
587611

588612
/* send the ack/nack to the node */
589-
acknak = create_request(CRM_OP_JOIN_ACKNAK, NULL, join_to,
590-
CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);
591-
crm_xml_add_int(acknak, F_CRM_JOIN_ID, current_join_id);
613+
acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to);
592614

593615
crm_debug("join-%d: ACK'ing join request from %s",
594616
current_join_id, join_to);

crmd/messages.c

+6
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,12 @@ handle_request(xmlNode * stored_msg, enum crmd_fsa_cause cause)
870870

871871
} else {
872872
reap_crm_member(id, name);
873+
874+
/* If we're forgetting this node, also forget any failures to fence
875+
* it, so we don't carry that over to any node added later with the
876+
* same name.
877+
*/
878+
st_fail_count_reset(name);
873879
}
874880

875881
} else if (strcmp(op, CRM_OP_MAINTENANCE_NODES) == 0) {

crmd/te_actions.c

+4-8
Original file line numberDiff line numberDiff line change
@@ -726,15 +726,11 @@ notify_crmd(crm_graph_t * graph)
726726
case tg_restart:
727727
type = "restart";
728728
if (fsa_state == S_TRANSITION_ENGINE) {
729-
if (too_many_st_failures() == FALSE) {
730-
if (transition_timer->period_ms > 0) {
731-
crm_timer_stop(transition_timer);
732-
crm_timer_start(transition_timer);
733-
} else {
734-
event = I_PE_CALC;
735-
}
729+
if (transition_timer->period_ms > 0) {
730+
crm_timer_stop(transition_timer);
731+
crm_timer_start(transition_timer);
736732
} else {
737-
event = I_TE_SUCCESS;
733+
event = I_PE_CALC;
738734
}
739735

740736
} else if (fsa_state == S_POLICY_ENGINE) {

crmd/te_callbacks.c

+79-17
Original file line numberDiff line numberDiff line change
@@ -635,8 +635,8 @@ struct st_fail_rec {
635635
int count;
636636
};
637637

638-
gboolean
639-
too_many_st_failures(void)
638+
static gboolean
639+
too_many_st_failures(const char *target)
640640
{
641641
GHashTableIter iter;
642642
const char *key = NULL;
@@ -646,32 +646,63 @@ too_many_st_failures(void)
646646
return FALSE;
647647
}
648648

649-
g_hash_table_iter_init(&iter, stonith_failures);
650-
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
651-
if (value->count > stonith_max_attempts ) {
652-
crm_warn("Too many failures to fence %s (%d), giving up", key, value->count);
653-
return TRUE;
649+
if (target == NULL) {
650+
g_hash_table_iter_init(&iter, stonith_failures);
651+
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
652+
if (value->count >= stonith_max_attempts) {
653+
target = (const char*)key;
654+
goto too_many;
655+
}
656+
}
657+
} else {
658+
value = g_hash_table_lookup(stonith_failures, target);
659+
if ((value != NULL) && (value->count >= stonith_max_attempts)) {
660+
goto too_many;
654661
}
655662
}
656663
return FALSE;
664+
665+
too_many:
666+
crm_warn("Too many failures (%d) to fence %s, giving up",
667+
value->count, target);
668+
return TRUE;
657669
}
658670

671+
/*!
672+
* \internal
673+
* \brief Reset a stonith fail count
674+
*
* Does nothing if the stonith_failures table does not exist. Matching
* records have their count set to 0; they are not removed from the table.
*
675+
* \param[in] target Name of node to reset, or NULL for all
*
* \note NOTE(review): because NULL means "all nodes", a caller that passes a
*       possibly-NULL name would clear every node's count -- confirm that
*       each caller guarantees a non-NULL name where only one node is meant.
676+
*/
659677
void
660678
st_fail_count_reset(const char *target)
661679
{
662-
struct st_fail_rec *rec = NULL;
663-
664-
if (stonith_failures) {
665-
rec = g_hash_table_lookup(stonith_failures, target);
680+
if (stonith_failures == NULL) { /* no failure table, so nothing to reset */
681+
return;
666682
}
667683

668-
if (rec) {
669-
rec->count = 0;
684+
if (target) { /* single-node case: reset only this node's record, if any */
685+
struct st_fail_rec *rec = NULL;
686+
687+
rec = g_hash_table_lookup(stonith_failures, target);
688+
if (rec) {
689+
rec->count = 0;
690+
}
691+
} else { /* target == NULL: walk the whole table and zero every record */
692+
GHashTableIter iter;
693+
const char *key = NULL;
694+
struct st_fail_rec *rec = NULL;
695+
696+
g_hash_table_iter_init(&iter, stonith_failures);
697+
while (g_hash_table_iter_next(&iter, (gpointer *) &key,
698+
(gpointer *) &rec)) {
699+
rec->count = 0;
700+
}
670701
}
671702
}
672703

673-
static void
674-
st_fail_count_increment(const char *target, int rc)
704+
void
705+
st_fail_count_increment(const char *target)
675706
{
676707
struct st_fail_rec *rec = NULL;
677708

@@ -694,6 +725,27 @@ st_fail_count_increment(const char *target, int rc)
694725
}
695726
}
696727

728+
/*!
729+
* \internal
730+
* \brief Abort transition due to stonith failure
731+
*
* The requested action is only ever downgraded: a request other than tg_stop
* becomes tg_stop when too_many_st_failures() reports too many failures; a
* tg_stop request is passed through unchanged without consulting the counts.
*
732+
* \param[in] abort_action Whether to restart or stop transition
733+
* \param[in] target Node whose fence failure count is checked (NULL to
*                          check every recorded node); if it has too many
*                          failures, the transition is stopped, not restarted
734+
* \param[in] reason Log this stonith action XML as abort reason (or NULL)
735+
*/
736+
void
737+
abort_for_stonith_failure(enum transition_action abort_action,
738+
const char *target, xmlNode *reason)
739+
{
740+
/* If stonith repeatedly fails, we eventually give up on starting a new
741+
* transition for that reason.
742+
*/
743+
if ((abort_action != tg_stop) && too_many_st_failures(target)) {
744+
abort_action = tg_stop;
745+
}
746+
abort_transition(INFINITY, abort_action, "Stonith failed", reason); /* reason text is fixed; reason XML is forwarded as given */
747+
}
748+
697749
void
698750
tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
699751
{
@@ -755,12 +807,22 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
755807

756808
} else {
757809
const char *target = crm_element_value_const(action->xml, XML_LRM_ATTR_TARGET);
810+
enum transition_action abort_action = tg_restart;
758811

759812
action->failed = TRUE;
760813
crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
761814
call_id, target, pcmk_strerror(rc));
762-
abort_transition(INFINITY, tg_restart, "Stonith failed", NULL);
763-
st_fail_count_increment(target, rc);
815+
816+
/* If no fence devices were available, there's no use in immediately
817+
* checking again, so don't start a new transition in that case.
818+
*/
819+
if (rc == -ENODEV) {
820+
crm_warn("No devices found in cluster to fence %s, giving up",
821+
target);
822+
abort_action = tg_stop;
823+
}
824+
825+
abort_for_stonith_failure(abort_action, target, NULL);
764826
}
765827

766828
update_graph(transition_graph, action);

crmd/te_utils.c

+12-7
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ fail_incompletable_stonith(crm_graph_t * graph)
162162

163163
if (last_action != NULL) {
164164
crm_warn("STONITHd failure resulted in un-runnable actions");
165-
abort_transition(INFINITY, tg_restart, "Stonith failure", last_action);
165+
abort_for_stonith_failure(tg_restart, NULL, last_action);
166166
return TRUE;
167167
}
168168

@@ -259,17 +259,22 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
259259
return;
260260
}
261261

262-
if (st_event->result == pcmk_ok &&
263-
safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
264-
st_fail_count_reset(st_event->target);
262+
if (safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
263+
if (st_event->result == pcmk_ok) {
264+
st_fail_count_reset(st_event->target);
265+
} else {
266+
st_fail_count_increment(st_event->target);
267+
}
265268
}
266269

267-
crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s",
270+
crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
271+
CRM_XS " initiator=%s ref=%s",
268272
st_event->target, st_event->result == pcmk_ok ? "" : " not",
269273
st_event->action,
270274
st_event->executioner ? st_event->executioner : "<anyone>",
271-
st_event->origin, pcmk_strerror(st_event->result), st_event->id,
272-
st_event->client_origin ? st_event->client_origin : "<unknown>");
275+
(st_event->client_origin? st_event->client_origin : "<unknown>"),
276+
pcmk_strerror(st_event->result),
277+
st_event->origin, st_event->id);
273278

274279
#if SUPPORT_CMAN
275280
if (st_event->result == pcmk_ok && is_cman_cluster()) {

0 commit comments

Comments
 (0)