diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index f359d86df7b2a..c4aa37782531b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -7174,6 +7174,14 @@ public synchronized void verifyToken(DelegationTokenIdentifier identifier,
   public EditLogTailer getEditLogTailer() {
     return editLogTailer;
   }
+
+  /**
+   * @return the last checkpoint time reported by the standby checkpointer.
+   */
+  @VisibleForTesting
+  public long getStandbyLastCheckpointTime() {
+    return standbyCheckpointer.getLastCheckpointTime();
+  }
 
   @VisibleForTesting
   public void setEditLogTailerForTests(EditLogTailer tailer) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
index ec848668d2561..8d914171ef2ad 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
@@ -342,7 +342,7 @@ public TransferFsImage.TransferResult call()
       throw ie;
     }
 
-    if (!ioes.isEmpty()) {
+    if (ioes.size() > activeNNAddresses.size() / 2) {
       throw MultipleIOException.createIOException(ioes);
     }
   }
@@ -375,6 +375,14 @@ static int getCanceledCount() {
     return canceledCount;
   }
 
+  /**
+   * @return the current value of {@code lastCheckpointTime}.
+   */
+  @VisibleForTesting
+  public long getLastCheckpointTime() {
+    return lastCheckpointTime;
+  }
+
   private long countUncheckpointedTxns() {
     FSImage img = namesystem.getFSImage();
     return img.getCorrectLastAppliedOrWrittenTxId() -
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java
index 8256caab762a9..c71a27ebda3c4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java
@@ -663,7 +663,50 @@ private void doCreate() throws IOException {
     out.write(42);
     out.close();
   }
-  
+
+  /**
+   * Verifies that the standby still completes its checkpoint and advances
+   * lastCheckpointTime when the fsimage upload to a shut-down NameNode fails.
+   */
+  @Test(timeout = 300000)
+  public void testPutFsimagePartFailed() throws Exception {
+    for (int i = 1; i < NUM_NNS; i++) {
+      cluster.shutdownNameNode(i);
+
+      // Have checkpoints triggered by the checkpoint period, not the transaction count.
+      cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 3);
+      cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000);
+    }
+    doEdits(0, 10);
+    cluster.transitionToStandby(0);
+
+    for (int i = 1; i < NUM_NNS; i++) {
+      cluster.restartNameNode(i, false);
+    }
+    cluster.waitClusterUp();
+    setNNs();
+
+    for (int i = 0; i < NUM_NNS; i++) {
+      // Once the standby catches up, it should do a checkpoint
+      // and save to local directories.
+      HATestUtil.waitForCheckpoint(cluster, i, ImmutableList.of(12));
+    }
+
+    long snnCheckpointTime1 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
+    cluster.transitionToActive(0);
+    cluster.transitionToObserver(2);
+    cluster.shutdownNameNode(2);
+
+    doEdits(11, 20);
+    nns[0].getRpcServer().rollEditLog();
+    HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(23));
+
+    long snnCheckpointTime2 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
+
+    // The standby must still complete its checkpoint and advance lastCheckpointTime
+    // even though uploading the fsimage to nn2 failed because nn2 is shut down.
+    assertTrue(snnCheckpointTime2 > snnCheckpointTime1);
+  }
 
   /**
    * A codec which just slows down the saving of the image significantly