HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang
(cherry picked from commit 7bb902bc0d)
This commit is contained in:
parent
eb045ea056
commit
e452163a06
@ -99,6 +99,19 @@ public class ImageServlet extends HttpServlet {
|
|||||||
"recent.image.check.enabled";
|
"recent.image.check.enabled";
|
||||||
public static final boolean RECENT_IMAGE_CHECK_ENABLED_DEFAULT = true;
|
public static final boolean RECENT_IMAGE_CHECK_ENABLED_DEFAULT = true;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Ratio applied to the configured checkpoint period to relax the time delta
|
||||||
|
* check. The relaxation accounts for minor time differences (e.g. due to
|
||||||
|
* image upload delay, or minor machine clock skew) that would otherwise
|
||||||
|
* cause the ANN to reject a fsImage too aggressively.
|
||||||
|
*/
|
||||||
|
private static double recentImageCheckTimePrecision = 0.75;
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
static void setRecentImageCheckTimePrecision(double ratio) {
|
||||||
|
recentImageCheckTimePrecision = ratio;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void doGet(final HttpServletRequest request,
|
public void doGet(final HttpServletRequest request,
|
||||||
final HttpServletResponse response) throws ServletException, IOException {
|
final HttpServletResponse response) throws ServletException, IOException {
|
||||||
@ -592,6 +605,9 @@ public Void run() throws Exception {
|
|||||||
long checkpointPeriod =
|
long checkpointPeriod =
|
||||||
conf.getTimeDuration(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
|
conf.getTimeDuration(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
|
||||||
DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT, TimeUnit.SECONDS);
|
DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT, TimeUnit.SECONDS);
|
||||||
|
checkpointPeriod = Math.round(
|
||||||
|
checkpointPeriod * recentImageCheckTimePrecision);
|
||||||
|
|
||||||
long checkpointTxnCount =
|
long checkpointTxnCount =
|
||||||
conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
|
conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
|
||||||
DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
|
DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
|
||||||
@ -612,21 +628,24 @@ public Void run() throws Exception {
|
|||||||
// a new fsImage
|
// a new fsImage
|
||||||
// 1. most recent image's txid is too far behind
|
// 1. most recent image's txid is too far behind
|
||||||
// 2. last checkpoint time was too old
|
// 2. last checkpoint time was too old
|
||||||
response.sendError(HttpServletResponse.SC_CONFLICT,
|
String message = "Rejecting a fsimage due to small time delta "
|
||||||
"Most recent checkpoint is neither too far behind in "
|
+ "and txnid delta. Time since previous checkpoint is "
|
||||||
+ "txid, nor too old. New txnid cnt is "
|
+ timeDelta + " expecting at least " + checkpointPeriod
|
||||||
+ (txid - lastCheckpointTxid)
|
+ " txnid delta since previous checkpoint is " +
|
||||||
+ ", expecting at least " + checkpointTxnCount
|
(txid - lastCheckpointTxid) + " expecting at least "
|
||||||
+ " unless too long since last upload.");
|
+ checkpointTxnCount;
|
||||||
|
LOG.info(message);
|
||||||
|
response.sendError(HttpServletResponse.SC_CONFLICT, message);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (nnImage.getStorage().findImageFile(nnf, txid) != null) {
|
if (nnImage.getStorage().findImageFile(nnf, txid) != null) {
|
||||||
response.sendError(HttpServletResponse.SC_CONFLICT,
|
String message = "Either current namenode has checkpointed or "
|
||||||
"Either current namenode has checkpointed or "
|
+ "another checkpointer already uploaded an "
|
||||||
+ "another checkpointer already uploaded an "
|
+ "checkpoint for txid " + txid;
|
||||||
+ "checkpoint for txid " + txid);
|
LOG.info(message);
|
||||||
|
response.sendError(HttpServletResponse.SC_CONFLICT, message);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2464,7 +2464,7 @@ public void testLegacyOivImage() throws Exception {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test(timeout = 300000)
|
@Test(timeout = 300000)
|
||||||
public void testActiveRejectSmallerDeltaImage() throws Exception {
|
public void testActiveRejectSmallerTxidDeltaImage() throws Exception {
|
||||||
MiniDFSCluster cluster = null;
|
MiniDFSCluster cluster = null;
|
||||||
Configuration conf = new HdfsConfiguration();
|
Configuration conf = new HdfsConfiguration();
|
||||||
// Set the delta txid threshold to 10
|
// Set the delta txid threshold to 10
|
||||||
@ -2517,6 +2517,57 @@ public void testActiveRejectSmallerDeltaImage() throws Exception {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that even with large txid and time delta thresholds configured, setting
|
||||||
|
* the time relaxation still allows the SBN to upload images to the ANN.
|
||||||
|
*
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testActiveImageWithTimeDeltaRelaxation() throws Exception {
|
||||||
|
Configuration conf = new HdfsConfiguration();
|
||||||
|
// Set the delta txid threshold to some arbitrarily large value, so
|
||||||
|
// it does not trigger a checkpoint during this test.
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000000);
|
||||||
|
// Set the delta time threshold to some arbitrarily large value, so
|
||||||
|
// it does not trigger a checkpoint during this test.
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 900000);
|
||||||
|
// Set relaxation to 0, which means a time delta of 0 from the previous image
|
||||||
|
// is fine; this effectively disables rejection of small delta images. NOTE(review): this static override is not restored in this method - confirm it cannot leak into other tests.
|
||||||
|
ImageServlet.setRecentImageCheckTimePrecision(0);
|
||||||
|
|
||||||
|
SecondaryNameNode secondary = null;
|
||||||
|
|
||||||
|
try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
|
||||||
|
.numDataNodes(0).format(true).build()) {
|
||||||
|
// enable small delta rejection
|
||||||
|
NameNode active = cluster.getNameNode();
|
||||||
|
active.httpServer.getHttpServer()
|
||||||
|
.setAttribute(RECENT_IMAGE_CHECK_ENABLED, true);
|
||||||
|
|
||||||
|
secondary = startSecondaryNameNode(conf);
|
||||||
|
|
||||||
|
FileSystem fs = cluster.getFileSystem();
|
||||||
|
assertEquals(0, active.getNamesystem().getFSImage()
|
||||||
|
.getMostRecentCheckpointTxId());
|
||||||
|
|
||||||
|
// Create 5 directories.
|
||||||
|
for (int i = 0; i < 5; i++) {
|
||||||
|
fs.mkdirs(new Path("dir-" + i));
|
||||||
|
}
|
||||||
|
|
||||||
|
// First checkpoint.
|
||||||
|
secondary.doCheckpoint();
|
||||||
|
// At this point, although this is a small delta change w.r.t. both
|
||||||
|
// txid and time delta, because we set relaxation to 0 this image
|
||||||
|
// still gets accepted.
|
||||||
|
assertEquals(9, active.getNamesystem().getFSImage()
|
||||||
|
.getMostRecentCheckpointTxId());
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
|
||||||
|
cleanup(secondary);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
private static void cleanup(SecondaryNameNode snn) {
|
private static void cleanup(SecondaryNameNode snn) {
|
||||||
if (snn != null) {
|
if (snn != null) {
|
||||||
try {
|
try {
|
||||||
|
Loading…
Reference in New Issue
Block a user