Skip to content

Commit d95ca4c

Browse files
smengclsiddhantsangwanswamirishi
authored
HDDS-12421. ContainerReportHandler should not make the call to delete replicas (apache#7976)
Co-authored-by: Siddhant Sangwan <[email protected]> Co-authored-by: Swaminathan Balachandran <[email protected]>
1 parent 26c859c commit d95ca4c

15 files changed

+356
-154
lines changed

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java

+22-20
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ private boolean updateContainerState(final DatanodeDetails datanode,
254254

255255
final ContainerID containerId = container.containerID();
256256
boolean ignored = false;
257-
257+
boolean replicaIsEmpty = replica.hasIsEmpty() && replica.getIsEmpty();
258258
switch (container.getState()) {
259259
case OPEN:
260260
/*
@@ -360,27 +360,29 @@ private boolean updateContainerState(final DatanodeDetails datanode,
360360
* The container is already in closed state. do nothing.
361361
*/
362362
break;
363+
case DELETED:
364+
// If container is in DELETED state and the reported replica is empty, delete the empty replica.
365+
// We should also do this for DELETING containers and currently DeletingContainerHandler does that
366+
if (replicaIsEmpty) {
367+
deleteReplica(containerId, datanode, publisher, "DELETED", false);
368+
break;
369+
}
363370
case DELETING:
364371
/*
365-
HDDS-11136: If a DELETING container has a non-empty CLOSED replica, the container should be moved back to CLOSED
366-
state.
372+
* HDDS-11136: If a DELETING container has a non-empty CLOSED replica, the container should be moved back to
373+
* CLOSED state.
374+
*
375+
* HDDS-12421: If a DELETING or DELETED container has a non-empty replica, the container should also be moved
376+
* back to CLOSED state.
367377
*/
368-
boolean replicaStateAllowed = replica.getState() == State.CLOSED;
369-
boolean replicaNotEmpty = replica.hasIsEmpty() && !replica.getIsEmpty();
370-
if (replicaStateAllowed && replicaNotEmpty) {
371-
logger.info("Moving DELETING container {} to CLOSED state, datanode {} reported replica with state={}, " +
372-
"isEmpty={} and keyCount={}.", containerId, datanode.getHostName(), replica.getState(), false,
373-
replica.getKeyCount());
374-
containerManager.transitionDeletingToClosedState(containerId);
378+
boolean replicaStateAllowed = (replica.getState() != State.INVALID && replica.getState() != State.DELETED);
379+
if (!replicaIsEmpty && replicaStateAllowed) {
380+
logger.info("Moving container {} from {} to CLOSED state, datanode {} reported replica with state={}, " +
381+
"isEmpty={}, bcsId={}, keyCount={}, and origin={}",
382+
container, container.getState(), datanode.getHostName(), replica.getState(),
383+
replica.getIsEmpty(), replica.getBlockCommitSequenceId(), replica.getKeyCount(), replica.getOriginNodeId());
384+
containerManager.transitionDeletingOrDeletedToClosedState(containerId);
375385
}
376-
377-
break;
378-
case DELETED:
379-
/*
380-
* The container is deleted. delete the replica.
381-
*/
382-
deleteReplica(containerId, datanode, publisher, "DELETED");
383-
ignored = true;
384386
break;
385387
default:
386388
break;
@@ -462,9 +464,9 @@ protected ContainerManager getContainerManager() {
462464
}
463465

464466
protected void deleteReplica(ContainerID containerID, DatanodeDetails dn,
465-
EventPublisher publisher, String reason) {
467+
EventPublisher publisher, String reason, boolean force) {
466468
SCMCommand<?> command = new DeleteContainerCommand(
467-
containerID.getId(), true);
469+
containerID.getId(), force);
468470
try {
469471
command.setTerm(scmContext.getTermOfLeader());
470472
} catch (NotLeaderException nle) {

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -135,14 +135,14 @@ void updateContainerState(ContainerID containerID,
135135
throws IOException, InvalidStateTransitionException;
136136

137137
/**
138-
* Bypasses the container state machine to change a container's state from DELETING to CLOSED. This API was
138+
* Bypasses the container state machine to change a container's state from DELETING or DELETED to CLOSED. This API was
139139
* introduced to fix a bug (HDDS-11136), and should be used with care otherwise.
140140
*
141141
* @see <a href="https://issues.apache.org/jira/browse/HDDS-11136">HDDS-11136</a>
142142
* @param containerID id of the container to transition
143143
* @throws IOException
144144
*/
145-
void transitionDeletingToClosedState(ContainerID containerID) throws IOException;
145+
void transitionDeletingOrDeletedToClosedState(ContainerID containerID) throws IOException;
146146

147147
/**
148148
* Returns the latest list of replicas for given containerId.

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -290,12 +290,12 @@ public void updateContainerState(final ContainerID cid,
290290
}
291291

292292
@Override
293-
public void transitionDeletingToClosedState(ContainerID containerID) throws IOException {
293+
public void transitionDeletingOrDeletedToClosedState(ContainerID containerID) throws IOException {
294294
HddsProtos.ContainerID proto = containerID.getProtobuf();
295295
lock.lock();
296296
try {
297297
if (containerExist(containerID)) {
298-
containerStateManager.transitionDeletingToClosedState(proto);
298+
containerStateManager.transitionDeletingOrDeletedToClosedState(proto);
299299
} else {
300300
throwContainerNotFoundException(containerID);
301301
}

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ private void processSingleReplica(final DatanodeDetails datanodeDetails,
234234
UNKNOWN_CONTAINER_ACTION_DELETE)) {
235235
final ContainerID containerId = ContainerID
236236
.valueOf(replicaProto.getContainerID());
237-
deleteReplica(containerId, datanodeDetails, publisher, "unknown");
237+
deleteReplica(containerId, datanodeDetails, publisher, "unknown", true);
238238
}
239239
return;
240240
}

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManager.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -159,15 +159,15 @@ void updateContainerState(HddsProtos.ContainerID id,
159159

160160

161161
/**
162-
* Bypasses the container state machine to change a container's state from DELETING to CLOSED. This API was
162+
* Bypasses the container state machine to change a container's state from DELETING or DELETED to CLOSED. This API was
163163
* introduced to fix a bug (HDDS-11136), and should be used with care otherwise.
164164
*
165165
* @see <a href="https://issues.apache.org/jira/browse/HDDS-11136">HDDS-11136</a>
166166
* @param id id of the container to transition
167167
* @throws IOException
168168
*/
169169
@Replicate
170-
void transitionDeletingToClosedState(HddsProtos.ContainerID id) throws IOException;
170+
void transitionDeletingOrDeletedToClosedState(HddsProtos.ContainerID id) throws IOException;
171171

172172
/**
173173
*

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -370,16 +370,16 @@ public void updateContainerState(final HddsProtos.ContainerID containerID,
370370
}
371371

372372
@Override
373-
public void transitionDeletingToClosedState(HddsProtos.ContainerID containerID) throws IOException {
373+
public void transitionDeletingOrDeletedToClosedState(HddsProtos.ContainerID containerID) throws IOException {
374374
final ContainerID id = ContainerID.getFromProtobuf(containerID);
375375

376376
try (AutoCloseableLock ignored = writeLock(id)) {
377377
if (containers.contains(id)) {
378378
final ContainerInfo oldInfo = containers.getContainerInfo(id);
379379
final LifeCycleState oldState = oldInfo.getState();
380-
if (oldState != DELETING) {
380+
if (oldState != DELETING && oldState != DELETED) {
381381
throw new InvalidContainerStateException("Cannot transition container " + id + " from " + oldState +
382-
" back to CLOSED. The container must be in the DELETING state.");
382+
" back to CLOSED. The container must be in the DELETING or DELETED state.");
383383
}
384384
ExecutionUtil.create(() -> {
385385
containers.updateState(id, oldState, CLOSED);

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/DeletingContainerHandler.java

+2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
2424
import org.apache.hadoop.hdds.scm.container.ContainerID;
2525
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
26+
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
2627
import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
2728
import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
2829
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
@@ -88,6 +89,7 @@ public boolean handle(ContainerCheckRequest request) {
8889
//resend deleteCommand if needed
8990
request.getContainerReplicas().stream()
9091
.filter(r -> !pendingDelete.contains(r.getDatanodeDetails()))
92+
.filter(ContainerReplica::isEmpty)
9193
.forEach(rp -> {
9294
try {
9395
replicationManager.sendDeleteCommand(

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerStateMap.java

+6
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes.FAILED_TO_CHANGE_CONTAINER_STATE;
2121

22+
import com.google.common.annotations.VisibleForTesting;
2223
import com.google.common.base.Preconditions;
2324
import com.google.common.collect.ImmutableSet;
2425
import java.util.Collections;
@@ -93,6 +94,11 @@ public ContainerStateMap() {
9394
this.replicaMap = new ConcurrentHashMap<>();
9495
}
9596

97+
@VisibleForTesting
98+
public static Logger getLogger() {
99+
return LOG;
100+
}
101+
96102
/**
97103
* Adds a ContainerInfo Entry in the ContainerStateMap.
98104
*

hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerManagerImpl.java

+34-9
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
4343
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
4444
import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps;
45+
import org.apache.hadoop.hdds.scm.container.states.ContainerStateMap;
4546
import org.apache.hadoop.hdds.scm.ha.SCMHAManager;
4647
import org.apache.hadoop.hdds.scm.ha.SCMHAManagerStub;
4748
import org.apache.hadoop.hdds.scm.ha.SequenceIdGenerator;
@@ -53,10 +54,15 @@
5354
import org.apache.hadoop.hdds.utils.db.DBStoreBuilder;
5455
import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
5556
import org.apache.hadoop.ozone.container.common.SCMTestUtils;
57+
import org.apache.ozone.test.GenericTestUtils;
5658
import org.junit.jupiter.api.AfterEach;
59+
import org.junit.jupiter.api.BeforeAll;
5760
import org.junit.jupiter.api.BeforeEach;
5861
import org.junit.jupiter.api.Test;
5962
import org.junit.jupiter.api.io.TempDir;
63+
import org.junit.jupiter.params.ParameterizedTest;
64+
import org.junit.jupiter.params.provider.EnumSource;
65+
import org.slf4j.event.Level;
6066

6167
/**
6268
* Tests to verify the functionality of ContainerManager.
@@ -72,6 +78,12 @@ public class TestContainerManagerImpl {
7278
private NodeManager nodeManager;
7379
private ContainerReplicaPendingOps pendingOpsMock;
7480

81+
@BeforeAll
82+
static void init() {
83+
// Print container state transition logs
84+
GenericTestUtils.setLogLevel(ContainerStateMap.getLogger(), Level.TRACE);
85+
}
86+
7587
@BeforeEach
7688
void setUp() throws Exception {
7789
final OzoneConfiguration conf = SCMTestUtils.getConf(testDir);
@@ -131,9 +143,12 @@ void testUpdateContainerState() throws Exception {
131143
assertEquals(LifeCycleState.CLOSED, containerManager.getContainer(cid).getState());
132144
}
133145

134-
@Test
135-
void testTransitionDeletingToClosedState() throws IOException, InvalidStateTransitionException {
136-
// allocate OPEN Ratis and Ec containers, and do a series of state changes to transition them to DELETING
146+
@ParameterizedTest
147+
@EnumSource(value = HddsProtos.LifeCycleState.class,
148+
names = {"DELETING", "DELETED"})
149+
void testTransitionDeletingOrDeletedToClosedState(HddsProtos.LifeCycleState desiredState)
150+
throws IOException, InvalidStateTransitionException {
151+
// Allocate OPEN Ratis and Ec containers, and do a series of state changes to transition them to DELETING / DELETED
137152
final ContainerInfo container = containerManager.allocateContainer(
138153
RatisReplicationConfig.getInstance(
139154
ReplicationFactor.THREE), "admin");
@@ -162,29 +177,39 @@ void testTransitionDeletingToClosedState() throws IOException, InvalidStateTrans
162177
assertEquals(LifeCycleState.DELETING, containerManager.getContainer(cid).getState());
163178
assertEquals(LifeCycleState.DELETING, containerManager.getContainer(ecCid).getState());
164179

165-
// DELETING -> CLOSED
166-
containerManager.transitionDeletingToClosedState(cid);
167-
containerManager.transitionDeletingToClosedState(ecCid);
180+
if (desiredState == LifeCycleState.DELETED) {
181+
// DELETING -> DELETED
182+
containerManager.updateContainerState(cid, HddsProtos.LifeCycleEvent.CLEANUP);
183+
containerManager.updateContainerState(ecCid, HddsProtos.LifeCycleEvent.CLEANUP);
184+
assertEquals(LifeCycleState.DELETED, containerManager.getContainer(cid).getState());
185+
assertEquals(LifeCycleState.DELETED, containerManager.getContainer(ecCid).getState());
186+
}
187+
188+
// DELETING / DELETED -> CLOSED
189+
containerManager.transitionDeletingOrDeletedToClosedState(cid);
190+
containerManager.transitionDeletingOrDeletedToClosedState(ecCid);
168191
// the containers should be back in CLOSED state now
169192
assertEquals(LifeCycleState.CLOSED, containerManager.getContainer(cid).getState());
170193
assertEquals(LifeCycleState.CLOSED, containerManager.getContainer(ecCid).getState());
171194
}
172195

173196
@Test
174-
void testTransitionDeletingToClosedStateAllowsOnlyDeletingContainers() throws IOException {
197+
void testTransitionContainerToClosedStateAllowOnlyDeletingOrDeletedContainers() throws IOException {
198+
// Negative test for Ratis/EC container OPEN -> CLOSED transition
199+
175200
// test for RATIS container
176201
final ContainerInfo container = containerManager.allocateContainer(
177202
RatisReplicationConfig.getInstance(
178203
ReplicationFactor.THREE), "admin");
179204
final ContainerID cid = container.containerID();
180205
assertEquals(LifeCycleState.OPEN, containerManager.getContainer(cid).getState());
181-
assertThrows(IOException.class, () -> containerManager.transitionDeletingToClosedState(cid));
206+
assertThrows(IOException.class, () -> containerManager.transitionDeletingOrDeletedToClosedState(cid));
182207

183208
// test for EC container
184209
final ContainerInfo ecContainer = containerManager.allocateContainer(new ECReplicationConfig(3, 2), "admin");
185210
final ContainerID ecCid = ecContainer.containerID();
186211
assertEquals(LifeCycleState.OPEN, containerManager.getContainer(ecCid).getState());
187-
assertThrows(IOException.class, () -> containerManager.transitionDeletingToClosedState(ecCid));
212+
assertThrows(IOException.class, () -> containerManager.transitionDeletingOrDeletedToClosedState(ecCid));
188213
}
189214

190215
@Test

0 commit comments

Comments
 (0)