HDDS-1365. Fix error handling in KeyValueContainerCheck. Contributed by Supratim Deka.

This commit is contained in:
Yiqun Lin 2019-04-03 14:01:30 +08:00
parent aaaf856f4b
commit f96fb05a2b
2 changed files with 81 additions and 215 deletions

View File

@ -68,9 +68,26 @@ public KeyValueContainerCheck(String metadataPath, Configuration conf,
} }
/** /**
* fast checks are basic and do not look inside the metadata files. * Run basic integrity checks on container metadata.
* Or into the structures on disk. These checks can be done on Open * These checks do not look inside the metadata files.
* containers as well without concurrency implications * Applicable for OPEN containers.
*
* @return true : corruption detected, false : no corruption.
*/
public boolean fastCheck() {
boolean corruption = false;
try {
basicChecks();
} catch (IOException e) {
handleCorruption(e);
corruption = true;
}
return corruption;
}
/**
* Checks : * Checks :
* 1. check directory layout * 1. check directory layout
* 2. check container file * 2. check container file
@ -78,24 +95,14 @@ public KeyValueContainerCheck(String metadataPath, Configuration conf,
* @return void * @return void
*/ */
public KvCheckError fastCheck() { private void basicChecks() throws IOException {
KvCheckError error; LOG.trace("Running basic checks for container {};", containerID);
LOG.trace("Running fast check for container {};", containerID);
error = loadContainerData(); loadContainerData();
if (error != KvCheckError.ERROR_NONE) {
return error;
}
error = checkLayout(); checkLayout();
if (error != KvCheckError.ERROR_NONE) { checkContainerFile();
return error;
}
error = checkContainerFile();
return error;
} }
/** /**
@ -107,129 +114,80 @@ public KvCheckError fastCheck() {
* <p> * <p>
* fullCheck is a superset of fastCheck * fullCheck is a superset of fastCheck
* *
* @return void * @return true : corruption detected, false : no corruption.
*/ */
public KvCheckError fullCheck() { public boolean fullCheck() {
/** boolean corruption = false;
*/ try {
KvCheckError error; basicChecks();
checkBlockDB();
error = fastCheck(); } catch (IOException e) {
if (error != KvCheckError.ERROR_NONE) { handleCorruption(e);
corruption = true;
LOG.trace("fastCheck failed, aborting full check for Container {}",
containerID);
return error;
} }
error = checkBlockDB(); return corruption;
return error;
} }
/** /**
* Check the integrity of the directory structure of the container. * Check the integrity of the directory structure of the container.
*
* @return error code or ERROR_NONE
*/ */
private KvCheckError checkLayout() { private void checkLayout() throws IOException {
boolean success;
KvCheckError error = KvCheckError.ERROR_NONE;
// is metadataPath accessible as a directory? // is metadataPath accessible as a directory?
try {
checkDirPath(metadataPath); checkDirPath(metadataPath);
} catch (IOException ie) {
error = KvCheckError.METADATA_PATH_ACCESS;
handleCorruption(ie.getMessage(), error, ie);
return error;
}
String chunksPath = onDiskContainerData.getChunksPath();
// is chunksPath accessible as a directory? // is chunksPath accessible as a directory?
try { String chunksPath = onDiskContainerData.getChunksPath();
checkDirPath(chunksPath); checkDirPath(chunksPath);
} catch (IOException ie) {
error = KvCheckError.CHUNKS_PATH_ACCESS;
handleCorruption(ie.getMessage(), error, ie);
return error;
}
return error;
} }
private void checkDirPath(String path) throws IOException { private void checkDirPath(String path) throws IOException {
File dirPath = new File(path); File dirPath = new File(path);
String errStr = null; String errStr = null;
boolean success = true;
try { try {
if (!dirPath.isDirectory()) { if (!dirPath.isDirectory()) {
success = false;
errStr = "Not a directory [" + path + "]"; errStr = "Not a directory [" + path + "]";
throw new IOException(errStr);
} }
} catch (SecurityException se) { } catch (SecurityException se) {
throw new IOException("Security exception checking dir [" throw new IOException("Security exception checking dir ["
+ path + "]", se); + path + "]", se);
} catch (Exception e) {
throw new IOException("Generic exception checking dir ["
+ path + "]", e);
} }
try {
String[] ls = dirPath.list(); String[] ls = dirPath.list();
if (ls == null) { if (ls == null) {
// null result implies operation failed // null result implies operation failed
success = false;
errStr = "null listing for directory [" + path + "]"; errStr = "null listing for directory [" + path + "]";
}
} catch (Exception e) {
throw new IOException("Exception listing dir [" + path + "]", e);
}
if (!success) {
Preconditions.checkState(errStr != null);
throw new IOException(errStr); throw new IOException(errStr);
} }
} }
private KvCheckError checkContainerFile() { private void checkContainerFile() throws IOException {
/** /**
* compare the values in the container file loaded from disk, * compare the values in the container file loaded from disk,
* with the values we are expecting * with the values we are expecting
*/ */
KvCheckError error = KvCheckError.ERROR_NONE;
String dbType; String dbType;
Preconditions Preconditions
.checkState(onDiskContainerData != null, "Container File not loaded"); .checkState(onDiskContainerData != null, "Container File not loaded");
KvCheckAction next;
try {
ContainerUtils.verifyChecksum(onDiskContainerData); ContainerUtils.verifyChecksum(onDiskContainerData);
} catch (Exception e) {
error = KvCheckError.CONTAINERDATA_CKSUM;
handleCorruption("Container File Checksum mismatch", error, e);
return error;
}
if (onDiskContainerData.getContainerType() if (onDiskContainerData.getContainerType()
!= ContainerProtos.ContainerType.KeyValueContainer) { != ContainerProtos.ContainerType.KeyValueContainer) {
String errStr = "Bad Container type in Containerdata for " + containerID; String errStr = "Bad Container type in Containerdata for " + containerID;
error = KvCheckError.CONTAINERDATA_TYPE; throw new IOException(errStr);
handleCorruption(errStr, error, null);
return error; // Abort if we do not know the type of Container
} }
if (onDiskContainerData.getContainerID() != containerID) { if (onDiskContainerData.getContainerID() != containerID) {
String errStr = String errStr =
"Bad ContainerID field in Containerdata for " + containerID; "Bad ContainerID field in Containerdata for " + containerID;
error = KvCheckError.CONTAINERDATA_ID; throw new IOException(errStr);
next = handleCorruption(errStr, error, null);
if (next == KvCheckAction.ABORT) {
return error;
} // else continue checking other data elements
} }
dbType = onDiskContainerData.getContainerDBType(); dbType = onDiskContainerData.getContainerDBType();
@ -237,9 +195,7 @@ private KvCheckError checkContainerFile() {
!dbType.equals(OZONE_METADATA_STORE_IMPL_LEVELDB)) { !dbType.equals(OZONE_METADATA_STORE_IMPL_LEVELDB)) {
String errStr = "Unknown DBType [" + dbType String errStr = "Unknown DBType [" + dbType
+ "] in Container File for [" + containerID + "]"; + "] in Container File for [" + containerID + "]";
error = KvCheckError.CONTAINERDATA_DBTYPE; throw new IOException(errStr);
handleCorruption(errStr, error, null);
return error;
} }
KeyValueContainerData kvData = onDiskContainerData; KeyValueContainerData kvData = onDiskContainerData;
@ -248,17 +204,11 @@ private KvCheckError checkContainerFile() {
"Bad metadata path in Containerdata for " + containerID + "Expected [" "Bad metadata path in Containerdata for " + containerID + "Expected ["
+ metadataPath.toString() + "] Got [" + kvData.getMetadataPath() + metadataPath.toString() + "] Got [" + kvData.getMetadataPath()
+ "]"; + "]";
error = KvCheckError.CONTAINERDATA_METADATA_PATH; throw new IOException(errStr);
next = handleCorruption(errStr, error, null);
if (next == KvCheckAction.ABORT) {
return error;
} }
} }
return error; private void checkBlockDB() throws IOException {
}
private KvCheckError checkBlockDB() {
/** /**
* Check the integrity of the DB inside each container. * Check the integrity of the DB inside each container.
* In Scope: * In Scope:
@ -269,52 +219,31 @@ private KvCheckError checkBlockDB() {
* 1. chunk checksum verification. this is left to a separate * 1. chunk checksum verification. this is left to a separate
* slow chunk scanner * slow chunk scanner
*/ */
KvCheckError error;
Preconditions.checkState(onDiskContainerData != null, Preconditions.checkState(onDiskContainerData != null,
"invoke loadContainerData prior to calling this function"); "invoke loadContainerData prior to calling this function");
File dbFile; File dbFile;
File metaDir = new File(metadataPath); File metaDir = new File(metadataPath);
try {
dbFile = KeyValueContainerLocationUtil dbFile = KeyValueContainerLocationUtil
.getContainerDBFile(metaDir, containerID); .getContainerDBFile(metaDir, containerID);
if (!dbFile.exists() || !dbFile.canRead()) { if (!dbFile.exists() || !dbFile.canRead()) {
String dbFileErrorMsg = "Unable to access DB File [" + dbFile.toString() String dbFileErrorMsg = "Unable to access DB File [" + dbFile.toString()
+ "] for Container [" + containerID + "] metadata path [" + "] for Container [" + containerID + "] metadata path ["
+ metadataPath + "]"; + metadataPath + "]";
error = KvCheckError.DB_ACCESS; throw new IOException(dbFileErrorMsg);
handleCorruption(dbFileErrorMsg, error, null);
return error;
} }
} catch (Exception e) {
String dbFileErrorMessage =
"Exception when initializing DBFile" + "with metadatapath ["
+ metadataPath + "] for Container [" + containerID
+ "]";
error = KvCheckError.DB_ACCESS;
handleCorruption(dbFileErrorMessage, error, e);
return error;
}
onDiskContainerData.setDbFile(dbFile);
try {
onDiskContainerData.setDbFile(dbFile);
MetadataStore db = BlockUtils MetadataStore db = BlockUtils
.getDB(onDiskContainerData, checkConfig); .getDB(onDiskContainerData, checkConfig);
error = iterateBlockDB(db);
} catch (Exception e) { iterateBlockDB(db);
error = KvCheckError.DB_ITERATOR;
handleCorruption("Block DB Iterator aborted", error, e);
return error;
} }
return error; private void iterateBlockDB(MetadataStore db)
}
private KvCheckError iterateBlockDB(MetadataStore db)
throws IOException { throws IOException {
KvCheckError error = KvCheckError.ERROR_NONE;
Preconditions.checkState(db != null); Preconditions.checkState(db != null);
// get "normal" keys from the Block DB // get "normal" keys from the Block DB
@ -328,103 +257,39 @@ private KvCheckError iterateBlockDB(MetadataStore db)
List<ContainerProtos.ChunkInfo> chunkInfoList = block.getChunks(); List<ContainerProtos.ChunkInfo> chunkInfoList = block.getChunks();
for (ContainerProtos.ChunkInfo chunk : chunkInfoList) { for (ContainerProtos.ChunkInfo chunk : chunkInfoList) {
File chunkFile; File chunkFile;
try { chunkFile = ChunkUtils.getChunkFile(onDiskContainerData,
chunkFile = ChunkUtils
.getChunkFile(onDiskContainerData,
ChunkInfo.getFromProtoBuf(chunk)); ChunkInfo.getFromProtoBuf(chunk));
} catch (Exception e) {
error = KvCheckError.MISSING_CHUNK_FILE;
handleCorruption("Unable to access chunk path", error, e);
return error;
}
if (!chunkFile.exists()) { if (!chunkFile.exists()) {
error = KvCheckError.MISSING_CHUNK_FILE;
// concurrent mutation in Block DB? lookup the block again. // concurrent mutation in Block DB? lookup the block again.
byte[] bdata = db.get( byte[] bdata = db.get(
Longs.toByteArray(block.getBlockID().getLocalID())); Longs.toByteArray(block.getBlockID().getLocalID()));
if (bdata == null) { if (bdata == null) {
LOG.trace("concurrency with delete, ignoring deleted block"); LOG.trace("concurrency with delete, ignoring deleted block");
error = KvCheckError.ERROR_NONE;
break; // skip to next block from kvIter break; // skip to next block from kvIter
} else { } else {
handleCorruption("Missing chunk file", error, null); String errorStr = "Missing chunk file "
return error; + chunkFile.getAbsolutePath();
throw new IOException(errorStr);
}
} }
} }
} }
} }
return error; private void loadContainerData() throws IOException {
}
private KvCheckError loadContainerData() {
KvCheckError error = KvCheckError.ERROR_NONE;
File containerFile = KeyValueContainer File containerFile = KeyValueContainer
.getContainerFile(metadataPath.toString(), containerID); .getContainerFile(metadataPath.toString(), containerID);
try {
onDiskContainerData = (KeyValueContainerData) ContainerDataYaml onDiskContainerData = (KeyValueContainerData) ContainerDataYaml
.readContainerFile(containerFile); .readContainerFile(containerFile);
} catch (IOException e) {
error = KvCheckError.FILE_LOAD;
handleCorruption("Unable to load Container File", error, e);
} }
return error; private void handleCorruption(IOException e) {
}
private KvCheckAction handleCorruption(String reason,
KvCheckError error, Exception e) {
// XXX HDDS-1201 need to implement corruption handling/reporting
String errStr = String errStr =
"Corruption detected in container: [" + containerID + "] reason: [" "Corruption detected in container: [" + containerID + "] ";
+ reason + "] error code: [" + error + "]"; String logMessage = errStr + "Exception: [" + e.getMessage() + "]";
String logMessage = null;
StackTraceElement[] stackeElems = Thread.currentThread().getStackTrace();
String caller =
"Corruption reported from Source File: [" + stackeElems[2].getFileName()
+ "] Line: [" + stackeElems[2].getLineNumber() + "]";
if (e != null) {
logMessage = errStr + " exception: [" + e.getMessage() + "]";
e.printStackTrace();
} else {
logMessage = errStr;
}
LOG.error(caller);
LOG.error(logMessage); LOG.error(logMessage);
return KvCheckAction.ABORT;
}
/**
* Pre-defined error codes for Container Metadata check.
*/
public enum KvCheckError {
ERROR_NONE,
FILE_LOAD, // unable to load container metafile
METADATA_PATH_ACCESS, // metadata path is not accessible
CHUNKS_PATH_ACCESS, // chunks path is not accessible
CONTAINERDATA_ID, // bad Container-ID stored in Container file
CONTAINERDATA_METADATA_PATH, // bad metadata path in Container file
CONTAINERDATA_CHUNKS_PATH, // bad chunks path in Container file
CONTAINERDATA_CKSUM, // container file checksum mismatch
CONTAINERDATA_TYPE, // container file incorrect type of Container
CONTAINERDATA_DBTYPE, // unknown DB Type specified in Container File
DB_ACCESS, // unable to load Metastore DB
DB_ITERATOR, // unable to create block iterator for Metastore DB
MISSING_CHUNK_FILE // chunk file not found
}
private enum KvCheckAction {
CONTINUE, // Continue with remaining checks on the corrupt Container
ABORT // Abort checks for the container
} }
} }

View File

@ -55,6 +55,7 @@
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_STORE_IMPL_LEVELDB; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_STORE_IMPL_LEVELDB;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_STORE_IMPL_ROCKSDB; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_STORE_IMPL_ROCKSDB;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
/** /**
@ -100,7 +101,7 @@ public TestKeyValueContainerCheck(String metadataImpl) {
int deletedBlocks = 1; int deletedBlocks = 1;
int normalBlocks = 3; int normalBlocks = 3;
int chunksPerBlock = 4; int chunksPerBlock = 4;
KeyValueContainerCheck.KvCheckError error; boolean corruption = false;
// test Closed Container // test Closed Container
createContainerWithBlocks(containerID, normalBlocks, deletedBlocks, 65536, createContainerWithBlocks(containerID, normalBlocks, deletedBlocks, 65536,
@ -114,14 +115,14 @@ public TestKeyValueContainerCheck(String metadataImpl) {
containerID); containerID);
// first run checks on an Open Container // first run checks on an Open Container
error = kvCheck.fastCheck(); corruption = kvCheck.fastCheck();
assertTrue(error == KeyValueContainerCheck.KvCheckError.ERROR_NONE); assertFalse(corruption);
container.close(); container.close();
// next run checks on a Closed Container // next run checks on a Closed Container
error = kvCheck.fullCheck(); corruption = kvCheck.fullCheck();
assertTrue(error == KeyValueContainerCheck.KvCheckError.ERROR_NONE); assertFalse(corruption);
} }
/** /**