Skip to content

Commit 26c859c

Browse files
authored
HDDS-12204. Improve failover logging (apache#7867)
1 parent efbf79c commit 26c859c

File tree

7 files changed

+29
-24
lines changed

7 files changed

+29
-24
lines changed

hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/ServerNotLeaderException.java

+9-8
Original file line number | Diff line number | Diff line change
@@ -30,20 +30,21 @@
3030
public class ServerNotLeaderException extends IOException {
3131
private final String leader;
3232
private static final Pattern CURRENT_PEER_ID_PATTERN =
33-
Pattern.compile("Server:(.*) is not the leader[.]+.*", Pattern.DOTALL);
33+
Pattern.compile(".* Server:(.*?) is not the leader[.]+.*", Pattern.DOTALL);
3434
private static final Pattern SUGGESTED_LEADER_PATTERN =
3535
Pattern.compile(".*Suggested leader is Server:([^:]*)(:[0-9]+).*",
3636
Pattern.DOTALL);
3737

38-
public ServerNotLeaderException(RaftPeerId currentPeerId) {
39-
super("Server:" + currentPeerId + " is not the leader. Could not " +
38+
public ServerNotLeaderException(RaftPeerId currentPeerId, String hostname,
39+
String roleType) {
40+
super(roleType + " Server:" + currentPeerId + "(" + hostname + ") is not the leader. Could not " +
4041
"determine the leader node.");
4142
this.leader = null;
4243
}
4344

4445
public ServerNotLeaderException(RaftPeerId currentPeerId,
45-
String suggestedLeader) {
46-
super("Server:" + currentPeerId + " is not the leader. Suggested leader is"
46+
String suggestedLeader, String hostname, String roleType) {
47+
super(roleType + " Server:" + currentPeerId + "(" + hostname + ") is not the leader. Suggested leader is"
4748
+ " Server:" + suggestedLeader + ".");
4849
this.leader = suggestedLeader;
4950
}
@@ -90,7 +91,7 @@ public String getSuggestedLeader() {
9091
*/
9192
public static ServerNotLeaderException convertToNotLeaderException(
9293
NotLeaderException notLeaderException,
93-
RaftPeerId currentPeer, String port) {
94+
RaftPeerId currentPeer, String port, String hostname, String roleType) {
9495
String suggestedLeader = notLeaderException.getSuggestedLeader() != null ?
9596
HddsUtils
9697
.getHostName(notLeaderException.getSuggestedLeader().getAddress())
@@ -100,9 +101,9 @@ public static ServerNotLeaderException convertToNotLeaderException(
100101
if (suggestedLeader != null) {
101102
String suggestedLeaderHostPort = suggestedLeader + ":" + port;
102103
serverNotLeaderException =
103-
new ServerNotLeaderException(currentPeer, suggestedLeaderHostPort);
104+
new ServerNotLeaderException(currentPeer, suggestedLeaderHostPort, hostname, roleType);
104105
} else {
105-
serverNotLeaderException = new ServerNotLeaderException(currentPeer);
106+
serverNotLeaderException = new ServerNotLeaderException(currentPeer, hostname, roleType);
106107
}
107108
return serverNotLeaderException;
108109
}

hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/TestServerNotLeaderExceptionMessageParsing.java

+6-6
Original file line number | Diff line number | Diff line change
@@ -29,13 +29,13 @@ public void testServerNotLeaderException() {
2929

3030
// Test hostname with "."
3131
final String msg =
32-
"Server:cf0bc565-a41b-4784-a24d-3048d5a5b013 is not the leader. "
32+
"SCM Server:cf0bc565-a41b-4784-a24d-3048d5a5b013(172.16.102.111) is not the leader. "
3333
+ "Suggested leader is Server:scm5-3.scm5.root.hwx.site:9863";
3434
ServerNotLeaderException snle = new ServerNotLeaderException(msg);
3535
assertEquals(snle.getSuggestedLeader(), "scm5-3.scm5.root.hwx" +
3636
".site:9863");
3737

38-
String message = "Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39 is not the " +
38+
String message = "SCM Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39(172.16.102.111) is not the " +
3939
"leader.Suggested leader is Server:scm5-3.scm5.root.hwx.site:9863 \n" +
4040
"at org.apache.hadoop.hdds.ratis.ServerNotLeaderException" +
4141
".convertToNotLeaderException(ServerNotLeaderException.java:96)";
@@ -44,30 +44,30 @@ public void testServerNotLeaderException() {
4444
snle.getSuggestedLeader());
4545

4646
// Test hostname without "."
47-
message = "Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39 is not the " +
47+
message = "SCM Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39(172.16.102.111) is not the " +
4848
"leader.Suggested leader is Server:localhost:98634 \n" +
4949
"at org.apache.hadoop.hdds.ratis.ServerNotLeaderException" +
5050
".convertToNotLeaderException(ServerNotLeaderException.java:96)";
5151
snle = new ServerNotLeaderException(message);
5252
assertEquals("localhost:98634",
5353
snle.getSuggestedLeader());
5454

55-
message = "Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39 is not the " +
55+
message = "SCM Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39(172.16.102.111) is not the " +
5656
"leader.Suggested leader is Server::98634 \n" +
5757
"at org.apache.hadoop.hdds.ratis.ServerNotLeaderException" +
5858
".convertToNotLeaderException(ServerNotLeaderException.java:96)";
5959
snle = new ServerNotLeaderException(message);
6060
assertNull(snle.getSuggestedLeader());
6161

62-
message = "Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39 is not the " +
62+
message = "SCM Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39(172.16.102.111) is not the " +
6363
"leader.Suggested leader is Server:localhost:98634:8988 \n" +
6464
"at org.apache.hadoop.hdds.ratis.ServerNotLeaderException" +
6565
".convertToNotLeaderException(ServerNotLeaderException.java:96)";
6666
snle = new ServerNotLeaderException(message);
6767
assertEquals("localhost:98634",
6868
snle.getSuggestedLeader());
6969

70-
message = "Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39 is not the " +
70+
message = "SCM Server:7fdd7170-75cc-4e11-b343-c2657c2f2f39(172.16.102.111) is not the " +
7171
"leader.Suggested leader is Server:localhost \n" +
7272
"at org.apache.hadoop.hdds.ratis.ServerNotLeaderException" +
7373
".convertToNotLeaderException(ServerNotLeaderException.java)";

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java

+2-2
Original file line number | Diff line number | Diff line change
@@ -235,7 +235,7 @@ private static void setRaftSnapshotProperties(
235235
}
236236

237237
public static void checkRatisException(IOException e, String port,
238-
String scmId) throws ServiceException {
238+
String scmId, String hostname, String roleType) throws ServiceException {
239239
if (SCMHAUtils.isNonRetriableException(e)) {
240240
throw new ServiceException(new NonRetriableException(e));
241241
} else if (SCMHAUtils.isRetriableWithNoFailoverException(e)) {
@@ -245,7 +245,7 @@ public static void checkRatisException(IOException e, String port,
245245
(NotLeaderException) SCMHAUtils.getNotLeaderException(e);
246246
throw new ServiceException(ServerNotLeaderException
247247
.convertToNotLeaderException(nle,
248-
SCMRatisServerImpl.getSelfPeerId(scmId), port));
248+
SCMRatisServerImpl.getSelfPeerId(scmId), port, hostname, roleType));
249249
} else if (e instanceof SCMSecurityException) {
250250
// For NOT_A_PRIMARY_SCM error client needs to retry on next SCM.
251251
// GetSCMCertificate call can happen on non-leader SCM and only an

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/SCMSecurityProtocolServerSideTranslatorPB.java

+3-2
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ public class SCMSecurityProtocolServerSideTranslatorPB
6060

6161
private final SCMSecurityProtocol impl;
6262
private final StorageContainerManager scm;
63+
private static final String ROLE_TYPE = "SCM";
6364

6465
private OzoneProtocolMessageDispatcher<SCMSecurityRequest,
6566
SCMSecurityResponse, ProtocolMessageEnum>
@@ -81,7 +82,7 @@ public SCMSecurityResponse submitRequest(RpcController controller,
8182
if (!scm.checkLeader()) {
8283
RatisUtil.checkRatisException(
8384
scm.getScmHAManager().getRatisServer().triggerNotLeaderException(),
84-
scm.getSecurityProtocolRpcPort(), scm.getScmId());
85+
scm.getSecurityProtocolRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE);
8586
}
8687
return dispatcher.processRequest(request, this::processRequest,
8788
request.getCmdType(), request.getTraceID());
@@ -149,7 +150,7 @@ public SCMSecurityResponse processRequest(SCMSecurityRequest request)
149150
}
150151
} catch (IOException e) {
151152
RatisUtil.checkRatisException(e, scm.getSecurityProtocolRpcPort(),
152-
scm.getScmId());
153+
scm.getScmId(), scm.getHostname(), ROLE_TYPE);
153154
scmSecurityResponse.setSuccess(false);
154155
scmSecurityResponse.setStatus(exceptionToResponseStatus(e));
155156
// If actual cause is set in SCMSecurityException, set message with

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/ScmBlockLocationProtocolServerSideTranslatorPB.java

+3-2
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,7 @@ public final class ScmBlockLocationProtocolServerSideTranslatorPB
7070

7171
private final ScmBlockLocationProtocol impl;
7272
private final StorageContainerManager scm;
73+
private static final String ROLE_TYPE = "SCM";
7374

7475
private static final Logger LOG = LoggerFactory
7576
.getLogger(ScmBlockLocationProtocolServerSideTranslatorPB.class);
@@ -109,7 +110,7 @@ public SCMBlockLocationResponse send(RpcController controller,
109110
if (!scm.checkLeader()) {
110111
RatisUtil.checkRatisException(
111112
scm.getScmHAManager().getRatisServer().triggerNotLeaderException(),
112-
scm.getBlockProtocolRpcPort(), scm.getScmId());
113+
scm.getBlockProtocolRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE);
113114
}
114115
return dispatcher.processRequest(
115116
request,
@@ -171,7 +172,7 @@ private SCMBlockLocationResponse processMessage(
171172
}
172173
} catch (IOException e) {
173174
RatisUtil.checkRatisException(e, scm.getBlockProtocolRpcPort(),
174-
scm.getScmId());
175+
scm.getScmId(), scm.getHostname(), ROLE_TYPE);
175176
response.setSuccess(false);
176177
response.setStatus(exceptionToResponseStatus(e));
177178
if (e.getMessage() != null) {

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/SecretKeyProtocolServerSideTranslatorPB.java

+3-2
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@ public class SecretKeyProtocolServerSideTranslatorPB
5555

5656
private final SecretKeyProtocolScm impl;
5757
private final StorageContainerManager scm;
58+
private static final String ROLE_TYPE = "SCM";
5859

5960
private OzoneProtocolMessageDispatcher<SCMSecretKeyRequest,
6061
SCMSecretKeyResponse, ProtocolMessageEnum> dispatcher;
@@ -75,7 +76,7 @@ public SCMSecretKeyResponse submitRequest(RpcController controller,
7576
if (!scm.checkLeader()) {
7677
RatisUtil.checkRatisException(
7778
scm.getScmHAManager().getRatisServer().triggerNotLeaderException(),
78-
scm.getSecurityProtocolRpcPort(), scm.getScmId());
79+
scm.getSecurityProtocolRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE);
7980
}
8081
return dispatcher.processRequest(request, this::processRequest,
8182
request.getCmdType(), request.getTraceID());
@@ -115,7 +116,7 @@ public SCMSecretKeyResponse processRequest(SCMSecretKeyRequest request)
115116
}
116117
} catch (IOException e) {
117118
RatisUtil.checkRatisException(e, scm.getSecurityProtocolRpcPort(),
118-
scm.getScmId());
119+
scm.getScmId(), scm.getHostname(), ROLE_TYPE);
119120
scmSecurityResponse.setSuccess(false);
120121
scmSecurityResponse.setStatus(exceptionToResponseStatus(e));
121122
// If actual cause is set in SCMSecurityException, set message with

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java

+3-2
Original file line number | Diff line number | Diff line change
@@ -178,6 +178,7 @@ public final class StorageContainerLocationProtocolServerSideTranslatorPB
178178

179179
private final StorageContainerLocationProtocol impl;
180180
private final StorageContainerManager scm;
181+
private static final String ROLE_TYPE = "SCM";
181182

182183
private OzoneProtocolMessageDispatcher<ScmContainerLocationRequest,
183184
ScmContainerLocationResponse, ProtocolMessageEnum>
@@ -210,7 +211,7 @@ public ScmContainerLocationResponse submitRequest(RpcController controller,
210211
&& !ADMIN_COMMAND_TYPE.contains(request.getCmdType())) {
211212
RatisUtil.checkRatisException(
212213
scm.getScmHAManager().getRatisServer().triggerNotLeaderException(),
213-
scm.getClientRpcPort(), scm.getScmId());
214+
scm.getClientRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE);
214215
}
215216
// After the request interceptor (now validator) framework is extended to
216217
// this server interface, this should be removed and solved via new
@@ -736,7 +737,7 @@ public ScmContainerLocationResponse processRequest(
736737
}
737738
} catch (IOException e) {
738739
RatisUtil
739-
.checkRatisException(e, scm.getClientRpcPort(), scm.getScmId());
740+
.checkRatisException(e, scm.getClientRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE);
740741
throw new ServiceException(e);
741742
}
742743
}

0 commit comments

Comments
 (0)