-
Notifications
You must be signed in to change notification settings - Fork 4k
Reconcile QQ node dead during delete and redeclare #14241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
2c91964
60b5a13
a727682
82aa2ed
821101f
1a079fe
43cebed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -268,9 +268,17 @@ start_cluster(Q) -> | |
| {LeaderNode, FollowerNodes} = | ||
| rabbit_queue_location:select_leader_and_followers(Q, QuorumSize), | ||
| LeaderId = {RaName, LeaderNode}, | ||
| UIDs = maps:from_list([{Node, ra:new_uid(ra_lib:to_binary(RaName))} | ||
| || Node <- [LeaderNode | FollowerNodes]]), | ||
| NewQ0 = amqqueue:set_pid(Q, LeaderId), | ||
| NewQ1 = amqqueue:set_type_state(NewQ0, | ||
| #{nodes => [LeaderNode | FollowerNodes]}), | ||
| NewQ1 = case rabbit_feature_flags:is_enabled(track_qq_members_uids) of | ||
| false -> | ||
| amqqueue:set_type_state(NewQ0, | ||
| #{nodes => [LeaderNode | FollowerNodes]}); | ||
| true -> | ||
| amqqueue:set_type_state(NewQ0, | ||
| #{nodes => UIDs}) | ||
| end, | ||
|
|
||
| Versions = [V || {ok, V} <- erpc:multicall(FollowerNodes, | ||
| rabbit_fifo, version, [], | ||
|
|
@@ -716,7 +724,7 @@ repair_amqqueue_nodes(Q0) -> | |
| {Name, _} = amqqueue:get_pid(Q0), | ||
| Members = ra_leaderboard:lookup_members(Name), | ||
| RaNodes = [N || {_, N} <- Members], | ||
| #{nodes := Nodes} = amqqueue:get_type_state(Q0), | ||
| Nodes = get_nodes(Q0), | ||
| case lists:sort(RaNodes) =:= lists:sort(Nodes) of | ||
| true -> | ||
| %% up to date | ||
|
|
@@ -725,7 +733,16 @@ repair_amqqueue_nodes(Q0) -> | |
| %% update amqqueue record | ||
| Fun = fun (Q) -> | ||
| TS0 = amqqueue:get_type_state(Q), | ||
| TS = TS0#{nodes => RaNodes}, | ||
| TS = case rabbit_feature_flags:is_enabled(track_qq_members_uids) of | ||
| false -> | ||
LoisSotoLopez marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| TS0#{nodes => RaNodes}; | ||
| true -> | ||
| RaUids = maps:from_list([{N, erpc:call(N, ra_directory, uid_of, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, in I feel like an I don't think that would be something that should happen in a normal scenario without some major error happening in some earlier execution phase. But agree, we could maybe log with a warning if |
||
| [?RA_SYSTEM, Name], | ||
| ?RPC_TIMEOUT)} | ||
| || N <- RaNodes]), | ||
| TS0#{nodes => RaUids} | ||
| end, | ||
| amqqueue:set_type_state(Q, TS) | ||
| end, | ||
| _ = rabbit_amqqueue:update(QName, Fun), | ||
|
|
@@ -784,11 +801,28 @@ maybe_apply_policies(Q, #{config := CurrentConfig}) -> | |
| {[amqqueue:amqqueue()], [amqqueue:amqqueue()]}. | ||
| recover(_Vhost, Queues) -> | ||
| lists:foldl( | ||
| fun (Q0, {R0, F0}) -> | ||
| {Name, _} = amqqueue:get_pid(Q0), | ||
| fun (Q, {R0, F0}) -> | ||
| {Name, _} = amqqueue:get_pid(Q), | ||
| ServerId = {Name, node()}, | ||
| QName = amqqueue:get_name(Q0), | ||
| MutConf = make_mutable_config(Q0), | ||
| QName = amqqueue:get_name(Q), | ||
| MutConf = make_mutable_config(Q), | ||
| RaUId = ra_directory:uid_of(?RA_SYSTEM, Name), | ||
LoisSotoLopez marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Also here then. |
||
| #{nodes := Nodes} = amqqueue:get_type_state(Q), | ||
| case Nodes of | ||
| List when is_list(List) -> | ||
| %% Queue is not aware of node to uid mapping, do nothing | ||
| ok; | ||
| #{node() := RaUId} -> | ||
| %% Queue is aware and uid for current node is correct, do | ||
| %% nothing | ||
| ok; | ||
| #{node() := _NewRaUId} -> | ||
| %% Queue is aware but it does not match the one returned by | ||
| %% ra_directory | ||
| rabbit_log:info("Quorum queue ~ts: detected node uuid change, " | ||
| "deleting old data directory", [rabbit_misc:rs(QName)]), | ||
| maybe_delete_data_dir(RaUId) | ||
| end, | ||
| Res = case ra:restart_server(?RA_SYSTEM, ServerId, MutConf) of | ||
| ok -> | ||
| % queue was restarted, good | ||
|
|
@@ -801,7 +835,7 @@ recover(_Vhost, Queues) -> | |
| [rabbit_misc:rs(QName), Err1]), | ||
| % queue was never started on this node | ||
| % so needs to be started from scratch. | ||
| case start_server(make_ra_conf(Q0, ServerId)) of | ||
| case start_server(make_ra_conf(Q, ServerId)) of | ||
| ok -> ok; | ||
| Err2 -> | ||
| ?LOG_WARNING("recover: quorum queue ~w could not" | ||
|
|
@@ -823,8 +857,7 @@ recover(_Vhost, Queues) -> | |
| %% present in the rabbit_queue table and not just in | ||
| %% rabbit_durable_queue | ||
| %% So many code paths are dependent on this. | ||
| ok = rabbit_db_queue:set_dirty(Q0), | ||
| Q = Q0, | ||
| ok = rabbit_db_queue:set_dirty(Q), | ||
LoisSotoLopez marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| case Res of | ||
| ok -> | ||
| {[Q | R0], F0}; | ||
|
|
@@ -1204,12 +1237,17 @@ cleanup_data_dir() -> | |
| maybe_delete_data_dir(UId) -> | ||
| _ = ra_directory:unregister_name(?RA_SYSTEM, UId), | ||
| Dir = ra_env:server_data_dir(?RA_SYSTEM, UId), | ||
| {ok, Config} = ra_log:read_config(Dir), | ||
| case maps:get(machine, Config) of | ||
| {module, rabbit_fifo, _} -> | ||
| ra_lib:recursive_delete(Dir); | ||
| _ -> | ||
| ok | ||
| case filelib:is_dir(Dir) of | ||
| false -> | ||
| ok; | ||
| true -> | ||
| {ok, Config} = ra_log:read_config(Dir), | ||
| case maps:get(machine, Config) of | ||
| {module, rabbit_fifo, _} -> | ||
| ra_lib:recursive_delete(Dir); | ||
| _ -> | ||
| ok | ||
| end | ||
| end. | ||
|
|
||
| policy_changed(Q) -> | ||
|
|
@@ -1374,16 +1412,29 @@ add_member(Q, Node, Membership) -> | |
| do_add_member(Q, Node, Membership, ?MEMBER_CHANGE_TIMEOUT). | ||
|
|
||
|
|
||
| do_add_member(Q, Node, Membership, Timeout) | ||
| when ?is_amqqueue(Q) andalso | ||
| ?amqqueue_is_quorum(Q) andalso | ||
| do_add_member(Q0, Node, Membership, Timeout) | ||
| when ?is_amqqueue(Q0) andalso | ||
| ?amqqueue_is_quorum(Q0) andalso | ||
| is_atom(Node) -> | ||
| {RaName, _} = amqqueue:get_pid(Q), | ||
| QName = amqqueue:get_name(Q), | ||
| {RaName, _} = amqqueue:get_pid(Q0), | ||
| QName = amqqueue:get_name(Q0), | ||
| %% TODO parallel calls might crash this, or add a duplicate in quorum_nodes | ||
| ServerId = {RaName, Node}, | ||
| Members = members(Q), | ||
|
|
||
| Members = members(Q0), | ||
| QTypeState0 = #{nodes := Nodes} = amqqueue:get_type_state(Q0), | ||
| NewRaUId = ra:new_uid(ra_lib:to_binary(RaName)), | ||
| QTypeState = case Nodes of | ||
LoisSotoLopez marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| L when is_list(L) -> | ||
| %% Queue is not aware of node to uid mapping, just add the new node | ||
| QTypeState0#{nodes => lists:usort([Node | Nodes])}; | ||
| #{Node := _} -> | ||
| %% Queue is aware and uid for targeted node exists, do nothing | ||
| QTypeState0; | ||
| _ -> | ||
| %% Queue is aware but current node has no UId, regen uid | ||
| QTypeState0#{nodes => Nodes#{Node => NewRaUId}} | ||
| end, | ||
| Q = amqqueue:set_type_state(Q0, QTypeState), | ||
| MachineVersion = erpc_call(Node, rabbit_fifo, version, [], infinity), | ||
| Conf = make_ra_conf(Q, ServerId, Membership, MachineVersion), | ||
| case ra:start_server(?RA_SYSTEM, Conf) of | ||
|
|
@@ -1393,8 +1444,12 @@ do_add_member(Q, Node, Membership, Timeout) | |
| {ok, {RaIndex, RaTerm}, Leader} -> | ||
| Fun = fun(Q1) -> | ||
| Q2 = update_type_state( | ||
| Q1, fun(#{nodes := Nodes} = Ts) -> | ||
| Ts#{nodes => lists:usort([Node | Nodes])} | ||
| Q1, fun(#{nodes := NodesList} = Ts) when is_list(NodesList) -> | ||
| Ts#{nodes => lists:usort([Node | NodesList])}; | ||
| (#{nodes := #{Node := _}} = Ts) -> | ||
| Ts; | ||
| (#{nodes := NodesMap} = Ts) when is_map(NodesMap) -> | ||
| Ts#{nodes => maps:put(Node, NewRaUId, NodesMap)} | ||
| end), | ||
| amqqueue:set_pid(Q2, Leader) | ||
LoisSotoLopez marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| end, | ||
|
|
@@ -1467,8 +1522,10 @@ delete_member(Q, Node) when ?amqqueue_is_quorum(Q) -> | |
| Fun = fun(Q1) -> | ||
| update_type_state( | ||
| Q1, | ||
| fun(#{nodes := Nodes} = Ts) -> | ||
| Ts#{nodes => lists:delete(Node, Nodes)} | ||
| fun(#{nodes := Nodes} = Ts) when is_list(Nodes) -> | ||
| Ts#{nodes => lists:delete(Node, Nodes)}; | ||
| (#{nodes := Nodes} = Ts) when is_map(Nodes) -> | ||
| Ts#{nodes => maps:remove(Node, Nodes)} | ||
| end) | ||
| end, | ||
| _ = rabbit_amqqueue:update(QName, Fun), | ||
|
|
@@ -1988,7 +2045,15 @@ make_ra_conf(Q, ServerId, TickTimeout, | |
| #resource{name = QNameBin} = QName, | ||
| RaMachine = ra_machine(Q), | ||
| [{ClusterName, _} | _] = Members = members(Q), | ||
| UId = ra:new_uid(ra_lib:to_binary(ClusterName)), | ||
| {_, Node} = ServerId, | ||
| UId = case amqqueue:get_type_state(Q) of | ||
LoisSotoLopez marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| #{nodes := #{Node := Id}} -> | ||
| Id; | ||
| _ -> | ||
| %% Queue was declared on an older version of RabbitMQ | ||
| %% or does not have the node to uid mappings | ||
| ra:new_uid(ra_lib:to_binary(ClusterName)) | ||
| end, | ||
| FName = rabbit_misc:rs(QName), | ||
| Formatter = {?MODULE, format_ra_event, [QName]}, | ||
| LogCfg = #{uid => UId, | ||
|
|
@@ -2020,7 +2085,12 @@ make_mutable_config(Q) -> | |
|
|
||
| get_nodes(Q) when ?is_amqqueue(Q) -> | ||
| #{nodes := Nodes} = amqqueue:get_type_state(Q), | ||
| Nodes. | ||
| case Nodes of | ||
| List when is_list(List) -> | ||
| List; | ||
| Map when is_map(Map) -> | ||
| maps:keys(Map) | ||
| end. | ||
|
|
||
| get_connected_nodes(Q) when ?is_amqqueue(Q) -> | ||
| ErlangNodes = [node() | nodes()], | ||
|
|
@@ -2127,7 +2197,7 @@ force_checkpoint_on_queue(QName) -> | |
| {ok, Q} when ?amqqueue_is_quorum(Q) -> | ||
| {RaName, _} = amqqueue:get_pid(Q), | ||
| ?LOG_DEBUG("Sending command to force ~ts to take a checkpoint", [QNameFmt]), | ||
| Nodes = amqqueue:get_nodes(Q), | ||
| Nodes = rabbit_queue_type:get_nodes(Q), | ||
| _ = [ra:cast_aux_command({RaName, Node}, force_checkpoint) | ||
| || Node <- Nodes], | ||
| ok; | ||
|
|
@@ -2322,7 +2392,7 @@ transfer_leadership(_CandidateNodes) -> | |
| %% wait for leader elections before processing next chunk of queues | ||
| [begin | ||
| {RaName, LeaderNode} = amqqueue:get_pid(Q), | ||
| MemberNodes = lists:delete(LeaderNode, amqqueue:get_nodes(Q)), | ||
| MemberNodes = lists:delete(LeaderNode, rabbit_queue_type:get_nodes(Q)), | ||
| %% we don't do any explicit error handling here as it is more | ||
| %% important to make progress | ||
| _ = lists:any(fun (N) -> | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.