8787import org .apache .cassandra .net .RequestCallback ;
8888import org .apache .cassandra .service .StorageService ;
8989import org .apache .cassandra .tcm .ClusterMetadata ;
90+ import org .apache .cassandra .tcm .Epoch ;
9091import org .apache .cassandra .tcm .membership .NodeId ;
9192import org .apache .cassandra .tcm .membership .NodeState ;
9293import org .apache .cassandra .utils .FBUtilities ;
106107import static org .apache .cassandra .gms .Gossiper .GossipedWith .CMS ;
107108import static org .apache .cassandra .gms .Gossiper .GossipedWith .SEED ;
108109import static org .apache .cassandra .gms .VersionedValue .BOOTSTRAPPING_STATUS ;
110+ import static org .apache .cassandra .gms .VersionedValue .HIBERNATE ;
111+ import static org .apache .cassandra .gms .VersionedValue .REMOVED_TOKEN ;
112+ import static org .apache .cassandra .gms .VersionedValue .REMOVING_TOKEN ;
113+ import static org .apache .cassandra .gms .VersionedValue .SHUTDOWN ;
114+ import static org .apache .cassandra .gms .VersionedValue .STATUS_LEFT ;
109115import static org .apache .cassandra .gms .VersionedValue .unsafeMakeVersionedValue ;
110116import static org .apache .cassandra .net .NoPayload .noPayload ;
111117import static org .apache .cassandra .net .Verb .ECHO_REQ ;
@@ -136,8 +142,7 @@ public class Gossiper implements IFailureDetectionEventListener, GossiperMBean,
136142 private static final ScheduledExecutorPlus executor = executorFactory ().scheduled ("GossipTasks" );
137143
138144 static final ApplicationState [] STATES = ApplicationState .values ();
139- static final List <String > DEAD_STATES = Arrays .asList (VersionedValue .REMOVING_TOKEN , VersionedValue .REMOVED_TOKEN ,
140- VersionedValue .STATUS_LEFT , VersionedValue .HIBERNATE );
145+ static final List <String > DEAD_STATES = Arrays .asList (REMOVING_TOKEN , REMOVED_TOKEN , STATUS_LEFT , HIBERNATE );
141146 static ArrayList <String > SILENT_SHUTDOWN_STATES = new ArrayList <>();
142147 static
143148 {
@@ -184,7 +189,15 @@ public class Gossiper implements IFailureDetectionEventListener, GossiperMBean,
184189 /* map where key is endpoint and value is timestamp when this endpoint was removed from
185190 * gossip. We will ignore any gossip regarding these endpoints for QUARANTINE_DELAY time
186191 * after removal to prevent nodes from falsely reincarnating during the time when removal
187- * gossip gets propagated to all nodes */
192+ * gossip gets propagated to all nodes.
193+ * Note: in future, this need only be used when ClusterMetadataService is in the GOSSIP state,
194+ * i.e. during the major upgrade to the version with CEP-21, but before the CMS is initialized.
195+ * In this state, gossip is still used to propagate changes to broadcast address and release
196+ * version. Once the CMS initialization is complete, this is no longer necessary.
197+ * Currently in order to support a controlled rollout of that change to behaviour, quarantine
198+ * is still used by default, but can be disabled via config (gossip_quarantine_disabled) or
199+ * JMX (GossiperMBean::setQuarantineDisabled)
200+ */
188201 private final Map <InetAddressAndPort , Long > justRemovedEndpoints = new ConcurrentHashMap <>();
189202
190203 private final Map <InetAddressAndPort , Long > expireTimeEndpointMap = new ConcurrentHashMap <>();
@@ -449,14 +462,7 @@ private static boolean isShutdown(EndpointState epState)
449462
450463 public static boolean isShutdown (VersionedValue vv )
451464 {
452- if (vv == null )
453- return false ;
454-
455- String value = vv .value ;
456- String [] pieces = value .split (VersionedValue .DELIMITER_STR , -1 );
457- assert (pieces .length > 0 );
458- String state = pieces [0 ];
459- return state .equals (VersionedValue .SHUTDOWN );
465+ return matchesStatusString (vv , SHUTDOWN );
460466 }
461467
462468 public static boolean isHibernate (EndpointState epState )
@@ -468,15 +474,39 @@ public static boolean isHibernate(EndpointState epState)
468474 }
469475
470476 public static boolean isHibernate (VersionedValue vv )
477+ {
478+ return matchesStatusString (vv , HIBERNATE );
479+ }
480+
481+ public static boolean isLeft (VersionedValue vv )
482+ {
483+ return matchesStatusString (vv , STATUS_LEFT );
484+ }
485+
486+ private static boolean matchesStatusString (VersionedValue vv , String toMatch )
471487 {
472488 if (vv == null )
473489 return false ;
474490
475- String value = vv .value ;
476- String [] pieces = value .split (VersionedValue .DELIMITER_STR , -1 );
491+ String [] pieces = vv .splitValue ();
477492 assert (pieces .length > 0 );
478493 String state = pieces [0 ];
479- return state .equals (VersionedValue .HIBERNATE );
494+ return state .equals (toMatch );
495+ }
496+
497+ public static long extractExpireTime (String [] pieces )
498+ {
499+ if (pieces .length < 3 )
500+ return 0L ;
501+ try
502+ {
503+ return Long .parseLong (pieces [2 ]);
504+ }
505+ catch (NumberFormatException e )
506+ {
507+ logger .debug ("Invalid value found for expire time ({}), ignoring" , pieces [2 ]);
508+ return 0L ;
509+ }
480510 }
481511
482512 public static void runInGossipStageBlocking (Runnable runnable )
@@ -695,10 +725,21 @@ private void quarantineEndpoint(InetAddressAndPort endpoint, long quarantineExpi
695725 {
696726 if (disableEndpointRemoval )
697727 return ;
728+
729+ // Quarantine is only necessary while upgrading from gossip-driven management of cluster metadata
730+ if (getQuarantineDisabled () && ClusterMetadata .current ().epoch .isAfter (Epoch .UPGRADE_GOSSIP ))
731+ return ;
732+
698733 justRemovedEndpoints .put (endpoint , quarantineExpiration );
699734 GossiperDiagnostics .quarantinedEndpoint (this , endpoint , quarantineExpiration );
700735 }
701736
737+ public void clearQuarantinedEndpoints ()
738+ {
739+ logger .info ("Clearing quarantined endpoints" );
740+ justRemovedEndpoints .clear ();
741+ }
742+
702743 /**
703744 * The gossip digest is built based on randomization
704745 * rather than just looping through the collection of live endpoints.
@@ -943,15 +984,14 @@ void doStatusCheck()
943984 }
944985
945986 // check for dead state removal
946- long expireTime = getExpireTimeForEndpoint (endpoint );
947- if (!epState .isAlive () && (now > expireTime )
948- && (!metadata .directory .allAddresses ().contains (endpoint )))
987+ if (!epState .isAlive () && (!metadata .directory .allJoinedEndpoints ().contains (endpoint )))
949988 {
950- if (logger .isDebugEnabled ())
989+ long expireTime = getExpireTimeForEndpoint (endpoint );
990+ if (now > expireTime )
951991 {
952- logger .debug ("time is expiring for endpoint : {} ({})" , endpoint , expireTime );
992+ logger .info ("Reached gossip expiry time for endpoint : {} ({})" , endpoint , expireTime );
993+ runInGossipStageBlocking (() -> evictFromMembership (endpoint ));
953994 }
954- runInGossipStageBlocking (() -> evictFromMembership (endpoint ));
955995 }
956996 }
957997 }
@@ -1892,11 +1932,15 @@ public int getCurrentGenerationNumber(String address) throws UnknownHostExceptio
18921932
18931933 public void addExpireTimeForEndpoint (InetAddressAndPort endpoint , long expireTime )
18941934 {
1895- if (logger .isDebugEnabled ())
1935+ if (expireTime == 0L )
1936+ {
1937+ logger .debug ("Supplied expire time for {} was 0, not recording" , endpoint );
1938+ }
1939+ else
18961940 {
18971941 logger .debug ("adding expire time for endpoint : {} ({})" , endpoint , expireTime );
1942+ expireTimeEndpointMap .put (endpoint , expireTime );
18981943 }
1899- expireTimeEndpointMap .put (endpoint , expireTime );
19001944 }
19011945
19021946 public static long computeExpireTime ()
@@ -2099,6 +2143,50 @@ public void unsafeSendLocalEndpointStateTo(InetAddressAndPort ep)
20992143 MessagingService .instance ().send (message , ep );
21002144 }
21012145
2146+ public void unsafeBroadcastLeftStatus (InetAddressAndPort left ,
2147+ Collection <Token > tokens ,
2148+ Iterable <InetAddressAndPort > sendTo )
2149+ {
2150+ runInGossipStageBlocking (() -> {
2151+ EndpointState epState = endpointStateMap .get (left );
2152+ if (epState == null )
2153+ {
2154+ logger .info ("No gossip state for node {}" , left );
2155+ return ;
2156+ }
2157+
2158+ NodeState state = ClusterMetadata .current ().directory .peerState (left );
2159+ if (state != NodeState .LEFT )
2160+ {
2161+ logger .info ("Node Status for {} is not LEFT ({})" , left , state );
2162+ return ;
2163+ }
2164+
2165+ EndpointState toSend = new EndpointState (epState );
2166+ toSend .forceNewerGenerationUnsafe ();
2167+ toSend .markDead ();
2168+ VersionedValue value = StorageService .instance .valueFactory .left (tokens , computeExpireTime ());
2169+
2170+ if (left .equals (getBroadcastAddressAndPort ()))
2171+ {
2172+ // Adding local state bumps the value's version. To keep this consistent across
2173+ // the cluster, re-fetch it before broadcasting.
2174+ Gossiper .instance .addLocalApplicationState (ApplicationState .STATUS_WITH_PORT , value );
2175+ value = Gossiper .instance .endpointStateMap .get (getBroadcastAddressAndPort ())
2176+ .getApplicationState (ApplicationState .STATUS_WITH_PORT );
2177+ }
2178+
2179+ toSend .addApplicationState (ApplicationState .STATUS_WITH_PORT , value );
2180+ GossipDigestAck2 payload = new GossipDigestAck2 (Collections .singletonMap (left , toSend ));
2181+ logger .info ("Sending app state with status {} to {}" , value .value , sendTo );
2182+ for (InetAddressAndPort ep : sendTo )
2183+ {
2184+ Message <GossipDigestAck2 > message = Message .out (Verb .GOSSIP_DIGEST_ACK2 , payload );
2185+ MessagingService .instance ().send (message , ep );
2186+ }
2187+ });
2188+ }
2189+
21022190 private void unsafeUpdateEpStates (InetAddressAndPort endpoint , EndpointState epstate )
21032191 {
21042192 checkProperThreadForStateMutation ();
@@ -2116,6 +2204,10 @@ private void unsafeUpdateEpStates(InetAddressAndPort endpoint, EndpointState eps
21162204 if (epstate .getHeartBeatState ().getGeneration () > 0 &&
21172205 (old == null || old .getHeartBeatState ().getGeneration () < epstate .getHeartBeatState ().getGeneration ()))
21182206 handleMajorStateChange (endpoint , epstate );
2207+
2208+ // mark dead if the supplied epstate said so
2209+ if (isDeadState (epstate ))
2210+ markDead (endpoint , old == null ? epstate : old );
21192211 }
21202212 }
21212213
@@ -2216,9 +2308,31 @@ private void mergeNodeToGossip(NodeId nodeId, ClusterMetadata metadata, Collecti
22162308 newValue = valueFactory .hibernate (true );
22172309 break ;
22182310 }
2311+
22192312 if (isLocal && !StorageService .instance .shouldJoinRing ())
22202313 break ;
2221- newValue = GossipHelper .nodeStateToStatus (nodeId , metadata , tokens , valueFactory , oldValue );
2314+
2315+ // If quarantine has been disabled and we have already seen a LEFT status for a remote peer
2316+ // which originated from the peer itself or the node which coordinated its removal (and so
2317+ // has a version > 0), keep it as this is how we ensure the gossip expiry time encoded in
2318+ // the status string converges across peers.
2319+ // Should a node leave and then rejoin after resetting its local state (i.e. wipe and
2320+ // rejoin), it is automatically unregistered which removes all gossip state for it so there
2321+ // will be no oldValue in that case.
2322+ //
2323+ // Note: don't reorder these conditions as isLeft includes a null check
2324+ if (getQuarantineDisabled () && !isLocal && Gossiper .isLeft (oldValue ) && oldValue .version > 0 )
2325+ {
2326+ logger .debug ("Already seen a LEFT status for {} with a non-zero version, " +
2327+ "dropping derived value {}" , endpoint , newValue );
2328+ newValue = oldValue ;
2329+ }
2330+ else
2331+ {
2332+ newValue = GossipHelper .nodeStateToStatus (nodeId , metadata , tokens , valueFactory , oldValue );
2333+ if (Gossiper .isLeft (newValue ))
2334+ Gossiper .instance .addExpireTimeForEndpoint (endpoint , Gossiper .extractExpireTime (newValue .splitValue ()));
2335+ }
22222336 break ;
22232337 default :
22242338 newValue = oldValue ;
@@ -2264,4 +2378,17 @@ public void triggerRoundWithCMS()
22642378 sendGossip (message , cms );
22652379 }
22662380 }
2381+
2382+ @ Override
2383+ public boolean getQuarantineDisabled ()
2384+ {
2385+ return DatabaseDescriptor .getGossipQuarantineDisabled ();
2386+ }
2387+
2388+ @ Override
2389+ public void setQuarantineDisabled (boolean enabled )
2390+ {
2391+ logger .info ("Setting gossip_quarantine_disabled: {}" , enabled );
2392+ DatabaseDescriptor .setGossipQuarantineDisabled (enabled );
2393+ }
22672394}
0 commit comments