Skip to content

Commit 3bf0ae1

Browse files
author
eddy.cao
committed
Fix the SNN repeatedly checkpointing after an fsimage transfer failure on one of the multiple NNs
1 parent 1abdf72 commit 3bf0ae1

File tree

3 files changed

+52
-2
lines changed

3 files changed

+52
-2
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7174,6 +7174,11 @@ public synchronized void verifyToken(DelegationTokenIdentifier identifier,
71747174
/** Returns this namesystem's {@code editLogTailer} field as-is (no copy, no null check). */
public EditLogTailer getEditLogTailer() {
71757175
return editLogTailer;
71767176
}
7177+
7178+
/**
 * Test-only accessor that forwards to
 * {@code standbyCheckpointer.getLastCheckpointTime()} and returns its result.
 *
 * NOTE(review): there is no null guard on {@code standbyCheckpointer}. If that
 * field can be unset (presumably when this NameNode is not running its standby
 * services), this call would throw a NullPointerException -- confirm that
 * callers only invoke it on a standby.
 *
 * @return the value reported by the standby checkpointer
 */
@VisibleForTesting
7179+
public long getStandbyLastCheckpointTime() {
7180+
return standbyCheckpointer.getLastCheckpointTime();
7181+
}
71777182

71787183
@VisibleForTesting
71797184
public void setEditLogTailerForTests(EditLogTailer tailer) {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ public TransferFsImage.TransferResult call()
342342
throw ie;
343343
}
344344

345-
if (!ioes.isEmpty()) {
345+
if (ioes.size() > activeNNAddresses.size() / 2) {
346346
throw MultipleIOException.createIOException(ioes);
347347
}
348348
}
@@ -375,6 +375,11 @@ static int getCanceledCount() {
375375
return canceledCount;
376376
}
377377

378+
/**
 * Test-only accessor for the {@code lastCheckpointTime} field.
 *
 * NOTE(review): the clock source and unit of the value are not visible in this
 * hunk (presumably a millisecond timestamp) -- confirm where the field is set.
 *
 * @return the current value of {@code lastCheckpointTime}
 */
@VisibleForTesting
379+
public long getLastCheckpointTime() {
380+
return lastCheckpointTime;
381+
}
382+
378383
private long countUncheckpointedTxns() {
379384
FSImage img = namesystem.getFSImage();
380385
return img.getCorrectLastAppliedOrWrittenTxId() -

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -663,7 +663,47 @@ private void doCreate() throws IOException {
663663
out.write(42);
664664
out.close();
665665
}
666-
666+
667+
/**
 * Checks that NN1's recorded last-checkpoint time still advances after NN2
 * has been shut down.
 *
 * Flow: reconfigure and restart NN1..NN(NUM_NNS-1), wait for every NN to
 * reach a checkpoint at txid 12, sample NN1's last-checkpoint time, make NN0
 * active and shut NN2 down, write more edits, wait for NN0 to reach a
 * checkpoint at txid 23, then sample NN1's last-checkpoint time again and
 * assert that it increased.
 *
 * NOTE(review): presumably the image upload to the stopped NN2 fails while
 * the upload to NN0 succeeds, which is the "part failed" case in the test
 * name -- confirm against StandbyCheckpointer.
 */
@Test(timeout = 300000)
668+
public void testPutFsimagePartFailed() throws Exception {
669+
for (int i = 1; i < NUM_NNS; i++) {
670+
cluster.shutdownNameNode(i);
671+
672+
// Set a short checkpoint period (3, presumably seconds) and a transaction
// threshold of 1000. This test writes far fewer than 1000 edits, so
// checkpoints are presumably triggered by the period rather than by the
// transaction count -- confirm.
673+
cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 3);
674+
cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000);
675+
}
676+
doEdits(0, 10);
677+
cluster.transitionToStandby(0);
678+
679+
// NN0 was just moved to standby, so the other NNs are restarted (and will
// checkpoint) while no NameNode is active.
680+
for (int i = 1; i < NUM_NNS; i++) {
681+
cluster.restartNameNode(i, false);
682+
}
683+
cluster.waitClusterUp();
684+
setNNs();
685+
686+
for (int i = 0; i < NUM_NNS; i++) {
687+
// Once the standby catches up, it should do a checkpoint
688+
// and save to local directories.
689+
HATestUtil.waitForCheckpoint(cluster, i, ImmutableList.of(12));
690+
}
691+
692+
// First sample of NN1's last-checkpoint time, taken before NN2 goes away.
long snnCheckpointTime1 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
693+
cluster.transitionToActive(0);
694+
cluster.transitionToObserver(2);
695+
cluster.shutdownNameNode(2);
696+
697+
doEdits(11, 20);
698+
nns[0].getRpcServer().rollEditLog();
699+
HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(23));
700+
701+
// Second sample, taken after NN0 has a checkpoint at txid 23.
long snnCheckpointTime2 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
702+
703+
// NN1's last-checkpoint time must have advanced between the two samples,
704+
// i.e. it was updated even though NN2 was down. Only a strict increase is
// asserted; the difference is not compared against the checkpoint period.
705+
assertTrue(snnCheckpointTime2 > snnCheckpointTime1);
706+
}
667707

668708
/**
669709
* A codec which just slows down the saving of the image significantly

0 commit comments

Comments
 (0)