@@ -635,8 +635,8 @@ struct st_fail_rec {
635
635
int count ;
636
636
};
637
637
638
- gboolean
639
- too_many_st_failures (void )
638
+ static gboolean
639
+ too_many_st_failures (const char * target )
640
640
{
641
641
GHashTableIter iter ;
642
642
const char * key = NULL ;
@@ -646,32 +646,63 @@ too_many_st_failures(void)
646
646
return FALSE;
647
647
}
648
648
649
- g_hash_table_iter_init (& iter , stonith_failures );
650
- while (g_hash_table_iter_next (& iter , (gpointer * ) & key , (gpointer * ) & value )) {
651
- if (value -> count > stonith_max_attempts ) {
652
- crm_warn ("Too many failures to fence %s (%d), giving up" , key , value -> count );
653
- return TRUE;
649
+ if (target == NULL ) {
650
+ g_hash_table_iter_init (& iter , stonith_failures );
651
+ while (g_hash_table_iter_next (& iter , (gpointer * ) & key , (gpointer * ) & value )) {
652
+ if (value -> count >= stonith_max_attempts ) {
653
+ target = (const char * )key ;
654
+ goto too_many ;
655
+ }
656
+ }
657
+ } else {
658
+ value = g_hash_table_lookup (stonith_failures , target );
659
+ if ((value != NULL ) && (value -> count >= stonith_max_attempts )) {
660
+ goto too_many ;
654
661
}
655
662
}
656
663
return FALSE;
664
+
665
+ too_many :
666
+ crm_warn ("Too many failures (%d) to fence %s, giving up" ,
667
+ value -> count , target );
668
+ return TRUE;
657
669
}
658
670
671
+ /*!
672
+ * \internal
673
+ * \brief Reset a stonith fail count
674
+ *
675
+ * \param[in] target Name of node to reset, or NULL for all
676
+ */
659
677
void
660
678
st_fail_count_reset (const char * target )
661
679
{
662
- struct st_fail_rec * rec = NULL ;
663
-
664
- if (stonith_failures ) {
665
- rec = g_hash_table_lookup (stonith_failures , target );
680
+ if (stonith_failures == NULL ) {
681
+ return ;
666
682
}
667
683
668
- if (rec ) {
669
- rec -> count = 0 ;
684
+ if (target ) {
685
+ struct st_fail_rec * rec = NULL ;
686
+
687
+ rec = g_hash_table_lookup (stonith_failures , target );
688
+ if (rec ) {
689
+ rec -> count = 0 ;
690
+ }
691
+ } else {
692
+ GHashTableIter iter ;
693
+ const char * key = NULL ;
694
+ struct st_fail_rec * rec = NULL ;
695
+
696
+ g_hash_table_iter_init (& iter , stonith_failures );
697
+ while (g_hash_table_iter_next (& iter , (gpointer * ) & key ,
698
+ (gpointer * ) & rec )) {
699
+ rec -> count = 0 ;
700
+ }
670
701
}
671
702
}
672
703
673
- static void
674
- st_fail_count_increment (const char * target , int rc )
704
+ void
705
+ st_fail_count_increment (const char * target )
675
706
{
676
707
struct st_fail_rec * rec = NULL ;
677
708
@@ -694,6 +725,27 @@ st_fail_count_increment(const char *target, int rc)
694
725
}
695
726
}
696
727
728
+ /*!
729
+ * \internal
730
+ * \brief Abort transition due to stonith failure
731
+ *
732
+ * \param[in] abort_action Whether to restart or stop transition
733
+ * \param[in] target Don't restart if this (NULL for any) has too many failures
734
+ * \param[in] reason Log this stonith action XML as abort reason (or NULL)
735
+ */
736
+ void
737
+ abort_for_stonith_failure (enum transition_action abort_action ,
738
+ const char * target , xmlNode * reason )
739
+ {
740
+ /* If stonith repeatedly fails, we eventually give up on starting a new
741
+ * transition for that reason.
742
+ */
743
+ if ((abort_action != tg_stop ) && too_many_st_failures (target )) {
744
+ abort_action = tg_stop ;
745
+ }
746
+ abort_transition (INFINITY , abort_action , "Stonith failed" , reason );
747
+ }
748
+
697
749
void
698
750
tengine_stonith_callback (stonith_t * stonith , stonith_callback_data_t * data )
699
751
{
@@ -755,12 +807,22 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
755
807
756
808
} else {
757
809
const char * target = crm_element_value_const (action -> xml , XML_LRM_ATTR_TARGET );
810
+ enum transition_action abort_action = tg_restart ;
758
811
759
812
action -> failed = TRUE;
760
813
crm_notice ("Stonith operation %d for %s failed (%s): aborting transition." ,
761
814
call_id , target , pcmk_strerror (rc ));
762
- abort_transition (INFINITY , tg_restart , "Stonith failed" , NULL );
763
- st_fail_count_increment (target , rc );
815
+
816
+ /* If no fence devices were available, there's no use in immediately
817
+ * checking again, so don't start a new transition in that case.
818
+ */
819
+ if (rc == - ENODEV ) {
820
+ crm_warn ("No devices found in cluster to fence %s, giving up" ,
821
+ target );
822
+ abort_action = tg_stop ;
823
+ }
824
+
825
+ abort_for_stonith_failure (abort_action , target , NULL );
764
826
}
765
827
766
828
update_graph (transition_graph , action );
0 commit comments