Commit 31058b2

HDFS-11446. TestMaintenanceState#testWithNNAndDNRestart fails intermittently. Contributed by Yiqun Lin.
1 parent 89bb8bf commit 31058b2
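
The flakiness came from TestMaintenanceState#checkWithRetry, which polled checkFile() in a hand-rolled loop of at most 200 iterations with a 100 ms sleep (roughly a 20-second ceiling) and handed the last mismatch back for the caller to assertNull(). The patch below rebuilds the helper on GenericTestUtils.waitFor(), which re-evaluates the check every 100 ms for up to 60 seconds. As a minimal, self-contained sketch of that polling pattern (the class and the replicaCountIsExpected() condition are hypothetical stand-ins for the test's checkFile() call; it assumes the hadoop-common test artifact and Guava are on the classpath):

import java.util.concurrent.TimeoutException;

import org.apache.hadoop.test.GenericTestUtils;

import com.google.common.base.Supplier;

public class WaitForSketch {
  // Hypothetical condition; in the real test this role is played by
  // checkFile(...) returning null once the LocatedBlock looks as expected.
  static boolean replicaCountIsExpected() {
    return true;
  }

  public static void main(String[] args)
      throws TimeoutException, InterruptedException {
    // Re-check every 100 ms; give up after 60 seconds.
    GenericTestUtils.waitFor(new Supplier<Boolean>() {
      @Override
      public Boolean get() {
        return replicaCountIsExpected();
      }
    }, 100, 60000);
  }
}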

1 file changed, +66 -62 lines changed

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMaintenanceState.java

Lines changed: 66 additions & 62 deletions
@@ -18,7 +18,6 @@
 package org.apache.hadoop.hdfs;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -30,12 +29,7 @@
 import java.util.List;
 import java.util.Map;
 
-import com.google.common.collect.Lists;
 import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
-import org.junit.Assert;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
@@ -48,8 +42,16 @@
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.util.Time;
+import org.junit.Assert;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Supplier;
+import com.google.common.collect.Lists;
 
 /**
  * This class tests node maintenance.
@@ -125,8 +127,8 @@ public void testTakeNodeOutOfEnteringMaintenance() throws Exception {
 
     // When node is in ENTERING_MAINTENANCE state, it can still serve read
    // requests
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas, null,
+        nodeOutofService);
 
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
 
@@ -387,8 +389,8 @@ private void testExpectedReplication(int replicationFactor,
 
     // The block should be replicated to another datanode to meet
     // expected replication count.
-    assertNull(checkWithRetry(ns, fileSys, file, expectedReplicasInRead,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, expectedReplicasInRead,
+        nodeOutofService);
 
     cleanupFile(fileSys, file);
     teardown();
@@ -548,19 +550,19 @@ public void testTransitionToDecommission() throws IOException {
         client.datanodeReport(DatanodeReportType.LIVE).length);
 
     // test 1, verify the replica in IN_MAINTENANCE state isn't in LocatedBlock
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), 0, null,
         AdminStates.DECOMMISSIONED);
 
     // test 2 after decommission has completed, the replication count is
     // replicas + 1 which includes the decommissioned node.
-    assertNull(checkWithRetry(ns, fileSys, file, replicas + 1, null));
+    checkWithRetry(ns, fileSys, file, replicas + 1, null);
 
     // test 3, put the node in service, replication count should restore.
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+    checkWithRetry(ns, fileSys, file, replicas, null);
 
     cleanupFile(fileSys, file);
   }
@@ -587,8 +589,8 @@ public void testTransitionFromDecommissioning() throws IOException {
     takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), Long.MAX_VALUE,
         null, AdminStates.IN_MAINTENANCE);
 
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     cleanupFile(fileSys, file);
   }
@@ -631,10 +633,10 @@ private void testDecommissionDifferentNodeAfterMaintenance(int repl)
     takeNodeOutofService(0, decommissionDNUuid, 0, null, maintenanceNodes,
         AdminStates.DECOMMISSIONED);
     // Out of the replicas returned, one is the decommissioned node.
-    assertNull(checkWithRetry(ns, fileSys, file, repl, maintenanceDN));
+    checkWithRetry(ns, fileSys, file, repl, maintenanceDN);
 
     putNodeInService(0, maintenanceDN);
-    assertNull(checkWithRetry(ns, fileSys, file, repl + 1, null));
+    checkWithRetry(ns, fileSys, file, repl + 1, null);
 
     cleanupFile(fileSys, file);
     teardown();
@@ -663,15 +665,15 @@ public void testMultipleNodesMaintenance() throws Exception {
         AdminStates.IN_MAINTENANCE);
 
     // Verify file replication matches maintenance state min replication
-    assertNull(checkWithRetry(ns, fileSys, file, 1, null, nodes[0]));
+    checkWithRetry(ns, fileSys, file, 1, null, nodes[0]);
 
     // Put the maintenance nodes back in service
     for (DatanodeInfo datanodeInfo : maintenanceDN) {
       putNodeInService(0, datanodeInfo);
     }
 
     // Verify file replication catching up to the old state
-    assertNull(checkWithRetry(ns, fileSys, file, repl, null));
+    checkWithRetry(ns, fileSys, file, repl, null);
 
     cleanupFile(fileSys, file);
   }
@@ -720,19 +722,19 @@ private void testChangeReplicationFactor(int oldFactor, int newFactor,
 
     // Verify that the nodeOutofService remains in blocksMap and
     // # of live replicas For read operation is expected.
-    assertNull(checkWithRetry(ns, fileSys, file, oldFactor - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, oldFactor - 1,
+        nodeOutofService);
 
     final DFSClient client = getDfsClient(0);
     client.setReplication(file.toString(), (short)newFactor);
 
     // Verify that the nodeOutofService remains in blocksMap and
     // # of live replicas for read operation.
-    assertNull(checkWithRetry(ns, fileSys, file, expectedLiveReplicas,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, expectedLiveReplicas,
+        nodeOutofService);
 
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, newFactor, null));
+    checkWithRetry(ns, fileSys, file, newFactor, null);
 
     cleanupFile(fileSys, file);
     teardown();
@@ -765,8 +767,8 @@ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
         getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
         AdminStates.IN_MAINTENANCE);
 
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     final DFSClient client = getDfsClient(0);
     assertEquals("All datanodes must be alive", numDatanodes,
@@ -779,16 +781,16 @@ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
         client.datanodeReport(DatanodeReportType.LIVE).length);
 
     // Dead maintenance node's blocks should remain in block map.
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     // When dead maintenance mode is transitioned to out of maintenance mode,
     // its blocks should be removed from block map.
     // This will then trigger replication to restore the live replicas back
     // to replication factor.
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService,
-        null));
+    checkWithRetry(ns, fileSys, file, replicas, nodeOutofService,
+        null);
 
     cleanupFile(fileSys, file);
   }
@@ -821,8 +823,8 @@ public void testWithNNAndDNRestart() throws Exception {
         getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
         AdminStates.IN_MAINTENANCE);
 
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     DFSClient client = getDfsClient(0);
     assertEquals("All datanodes must be alive", numDatanodes,
@@ -836,23 +838,23 @@ public void testWithNNAndDNRestart() throws Exception {
         client.datanodeReport(DatanodeReportType.LIVE).length);
 
     // Dead maintenance node's blocks should remain in block map.
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     // restart nn, nn will restore 3 live replicas given it doesn't
     // know the maintenance node has the replica.
     getCluster().restartNameNode(0);
     ns = getCluster().getNamesystem(0);
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+    checkWithRetry(ns, fileSys, file, replicas, null);
 
     // restart dn, nn has 1 maintenance replica and 3 live replicas.
     getCluster().restartDataNode(dnProp, true);
     getCluster().waitActive();
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas, nodeOutofService);
 
     // Put the node in service, a redundant replica should be removed.
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+    checkWithRetry(ns, fileSys, file, replicas, null);
 
     cleanupFile(fileSys, file);
   }
@@ -878,12 +880,12 @@ public void testWriteAfterMaintenance() throws IOException {
     writeFile(fileSys, file, replicas, 2);
 
     // Verify nodeOutofService wasn't chosen for write operation.
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService, null));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService, null);
 
     // Put the node back to service, live replicas should be restored.
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+    checkWithRetry(ns, fileSys, file, replicas, null);
 
     cleanupFile(fileSys, file);
   }
@@ -934,12 +936,12 @@ public void testInvalidation() throws IOException {
     client.setReplication(file.toString(), (short) 1);
 
     // Verify the nodeOutofService remains in blocksMap.
-    assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService));
+    checkWithRetry(ns, fileSys, file, 1, nodeOutofService);
 
     // Restart NN and verify the nodeOutofService remains in blocksMap.
     getCluster().restartNameNode(0);
     ns = getCluster().getNamesystem(0);
-    assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService));
+    checkWithRetry(ns, fileSys, file, 1, nodeOutofService);
 
     cleanupFile(fileSys, file);
   }
@@ -1081,30 +1083,32 @@ static String checkFile(FSNamesystem ns, FileSystem fileSys,
     return null;
   }
 
-  static String checkWithRetry(FSNamesystem ns, FileSystem fileSys,
-      Path name, int repl, DatanodeInfo inMaintenanceNode)
-      throws IOException {
-    return checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
+  static void checkWithRetry(FSNamesystem ns, FileSystem fileSys, Path name,
+      int repl, DatanodeInfo inMaintenanceNode) {
+    checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
         inMaintenanceNode);
   }
 
-  static String checkWithRetry(FSNamesystem ns, FileSystem fileSys,
-      Path name, int repl, DatanodeInfo excludedNode,
-      DatanodeInfo underMaintenanceNode) throws IOException {
-    int tries = 0;
-    String output = null;
-    while (tries++ < 200) {
-      try {
-        Thread.sleep(100);
-        output = checkFile(ns, fileSys, name, repl, excludedNode,
-            underMaintenanceNode);
-        if (output == null) {
-          break;
+  static void checkWithRetry(final FSNamesystem ns, final FileSystem fileSys,
+      final Path name, final int repl, final DatanodeInfo excludedNode,
+      final DatanodeInfo underMaintenanceNode) {
+    try {
+      GenericTestUtils.waitFor(new Supplier<Boolean>() {
+
+        @Override
+        public Boolean get() {
+          String output = null;
+          try {
+            output = checkFile(ns, fileSys, name, repl, excludedNode,
+                underMaintenanceNode);
+          } catch (Exception ignored) {
+          }
+
+          return (output == null);
         }
-      } catch (InterruptedException ie) {
-      }
+      }, 100, 60000);
+    } catch (Exception ignored) {
    }
-    return output;
  }
 
   static private DatanodeInfo[] getFirstBlockReplicasDatanodeInfos(
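
With the helper now returning void, each call site above simply invokes checkWithRetry(...) instead of wrapping it in assertNull(...). Since com.google.common.base.Supplier has a single abstract method, on a Java 8 codebase the anonymous class could also be written as a lambda; a hypothetical equivalent of the new retry body (same parameters as above, not what the patch actually ships):

    try {
      GenericTestUtils.waitFor(() -> {
        String output = null;
        try {
          output = checkFile(ns, fileSys, name, repl, excludedNode,
              underMaintenanceNode);
        } catch (Exception ignored) {
          // As in the anonymous-class version, an exception leaves output null.
        }
        return output == null;
      }, 100, 60000);
    } catch (Exception ignored) {
    }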
