From 5bf7e594d7d54e5295fe4240c3d60c08d4755ab7 Mon Sep 17 00:00:00 2001 From: Xiao Chen Date: Thu, 28 Dec 2017 11:52:49 -0800 Subject: [PATCH] HDFS-9023. When NN is not able to identify DN for replication, reason behind it can be logged. --- .../BlockPlacementPolicyDefault.java | 74 ++++++++++++++++--- .../blockmanagement/DatanodeDescriptor.java | 2 +- 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java index b925feb99b..a37cda4258 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java @@ -62,6 +62,28 @@ protected StringBuilder initialValue() { } }; + private static final ThreadLocal> + CHOOSE_RANDOM_REASONS = ThreadLocal + .withInitial(() -> new HashMap()); + + private enum NodeNotChosenReason { + NOT_IN_SERVICE("the node isn't in service"), + NODE_STALE("the node is stale"), + NODE_TOO_BUSY("the node is too busy"), + TOO_MANY_NODES_ON_RACK("the rack has too many chosen nodes"), + NOT_ENOUGH_STORAGE_SPACE("no enough storage space to place the block"); + + private final String text; + + NodeNotChosenReason(final String logText) { + text = logText; + } + + private String getText() { + return text; + } + } + protected boolean considerLoad; protected double considerLoadFactor; private boolean preferLocalNode; @@ -711,6 +733,7 @@ protected DatanodeStorageInfo chooseRandom(int numOfReplicas, builder.setLength(0); builder.append("["); } + CHOOSE_RANDOM_REASONS.get().clear(); boolean badTarget = false; DatanodeStorageInfo firstChosen = null; while (numOfReplicas > 0) { @@ -781,14 +804,24 @@ protected DatanodeStorageInfo chooseRandom(int numOfReplicas, } if (numOfReplicas>0) { String detail = enableDebugLogging; - if (LOG.isDebugEnabled()) { - if (badTarget && builder != null) { - detail = builder.toString(); + if (LOG.isDebugEnabled() && builder != null) { + detail = builder.toString(); + if (badTarget) { builder.setLength(0); } else { + if (detail.length() > 1) { + // only log if there's more than "[", which is always appended at + // the beginning of this method. + LOG.debug(detail); + } detail = ""; } } + final HashMap reasonMap = + CHOOSE_RANDOM_REASONS.get(); + if (!reasonMap.isEmpty()) { + LOG.info("Not enough replicas was chosen. Reason:{}", reasonMap); + } throw new NotEnoughReplicasException(detail); } @@ -834,19 +867,38 @@ DatanodeStorageInfo chooseStorage4Block(DatanodeDescriptor dnd, if (storage != null) { results.add(storage); } else { - logNodeIsNotChosen(dnd, "no good storage to place the block "); + logNodeIsNotChosen(dnd, NodeNotChosenReason.NOT_ENOUGH_STORAGE_SPACE, + " for storage type " + storageType); } return storage; } private static void logNodeIsNotChosen(DatanodeDescriptor node, - String reason) { + NodeNotChosenReason reason) { + logNodeIsNotChosen(node, reason, null); + } + + private static void logNodeIsNotChosen(DatanodeDescriptor node, + NodeNotChosenReason reason, String reasonDetails) { + assert reason != null; if (LOG.isDebugEnabled()) { // build the error message for later use. debugLoggingBuilder.get() .append("\n Datanode ").append(node) - .append(" is not chosen since ").append(reason).append("."); + .append(" is not chosen since ").append(reason.getText()); + if (reasonDetails != null) { + debugLoggingBuilder.get().append(" ").append(reasonDetails); + } + debugLoggingBuilder.get().append("."); } + // always populate reason map to log high level reasons. + final HashMap reasonMap = + CHOOSE_RANDOM_REASONS.get(); + Integer base = reasonMap.get(reason); + if (base == null) { + base = 0; + } + reasonMap.put(reason, base + 1); } /** @@ -868,13 +920,13 @@ boolean isGoodDatanode(DatanodeDescriptor node, boolean avoidStaleNodes) { // check if the node is (being) decommissioned if (!node.isInService()) { - logNodeIsNotChosen(node, "the node isn't in service."); + logNodeIsNotChosen(node, NodeNotChosenReason.NOT_IN_SERVICE); return false; } if (avoidStaleNodes) { if (node.isStale(this.staleInterval)) { - logNodeIsNotChosen(node, "the node is stale "); + logNodeIsNotChosen(node, NodeNotChosenReason.NODE_STALE); return false; } } @@ -885,8 +937,8 @@ boolean isGoodDatanode(DatanodeDescriptor node, stats.getInServiceXceiverAverage(); final int nodeLoad = node.getXceiverCount(); if (nodeLoad > maxLoad) { - logNodeIsNotChosen(node, "the node is too busy (load: " + nodeLoad - + " > " + maxLoad + ") "); + logNodeIsNotChosen(node, NodeNotChosenReason.NODE_TOO_BUSY, + "(load: " + nodeLoad + " > " + maxLoad + ")"); return false; } } @@ -901,7 +953,7 @@ boolean isGoodDatanode(DatanodeDescriptor node, } } if (counter > maxTargetPerRack) { - logNodeIsNotChosen(node, "the rack has too many chosen nodes "); + logNodeIsNotChosen(node, NodeNotChosenReason.TOO_MANY_NODES_ON_RACK); return false; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java index fc587086ae..618bc13c5b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java @@ -764,7 +764,7 @@ public DatanodeStorageInfo chooseStorage4Block(StorageType t, } } if (requiredSize > remaining - scheduledSize) { - LOG.debug( + BlockPlacementPolicy.LOG.debug( "The node {} does not have enough {} space (required={}," + " scheduled={}, remaining={}).", this, t, requiredSize, scheduledSize, remaining);