From ddbff7c8d3f1851e5c5fa9bc33637e859d7d8ccf Mon Sep 17 00:00:00 2001 From: Akira Ajisaka Date: Thu, 28 Jun 2018 14:58:40 +0900 Subject: [PATCH] HADOOP-14313. Replace/improve Hadoop's byte[] comparator. Contributed by Vikas Vishwakarma. --- NOTICE.txt | 8 ++++ .../apache/hadoop/io/FastByteComparisons.java | 44 +++++++------------ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt index 95a670d9ee..a53f13c700 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -196,6 +196,14 @@ by Google Inc, which can be obtained at: * HOMEPAGE: * http://code.google.com/p/snappy/ +This product contains a modified portion of UnsignedBytes LexicographicalComparator +from Guava v21 project by Google Inc, which can be obtained at: + + * LICENSE: + * license/COPYING (Apache License 2.0) + * HOMEPAGE: + * https://github.com/google/guava + This product optionally depends on 'JBoss Marshalling', an alternative Java serialization API, which can be obtained at: diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java index a2903f89b9..5af6602b87 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java @@ -26,7 +26,6 @@ import org.slf4j.LoggerFactory; import sun.misc.Unsafe; -import com.google.common.primitives.Longs; import com.google.common.primitives.UnsignedBytes; /** @@ -195,52 +194,43 @@ public int compareTo(byte[] buffer1, int offset1, int length1, length1 == length2) { return 0; } + final int stride = 8; int minLength = Math.min(length1, length2); - int minWords = minLength / Longs.BYTES; + int strideLimit = minLength & ~(stride - 1); int offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET; int offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET; + int i; /* * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a * time is no slower than comparing 4 bytes at a time even on 32-bit. * On the other hand, it is substantially faster on 64-bit. */ - for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) { + for (i = 0; i < strideLimit; i += stride) { long lw = theUnsafe.getLong(buffer1, offset1Adj + (long) i); long rw = theUnsafe.getLong(buffer2, offset2Adj + (long) i); - long diff = lw ^ rw; - if (diff != 0) { + if (lw != rw) { if (!littleEndian) { return lessThanUnsigned(lw, rw) ? -1 : 1; } - // Use binary search - int n = 0; - int y; - int x = (int) diff; - if (x == 0) { - x = (int) (diff >>> 32); - n = 32; - } - - y = x << 16; - if (y == 0) { - n += 16; - } else { - x = y; - } - - y = x << 8; - if (y == 0) { - n += 8; - } - return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL)); + /* + * We want to compare only the first index where left[index] != + * right[index]. This corresponds to the least significant nonzero + * byte in lw ^ rw, since lw and rw are little-endian. + * Long.numberOfTrailingZeros(diff) tells us the least significant + * nonzero bit, and zeroing out the first three bits of L.nTZ gives + * us the shift to get that least significant nonzero byte. This + * comparison logic is based on UnsignedBytes from Guava v21 + */ + int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7; + return ((int) ((lw >>> n) & 0xFF)) - ((int) ((rw >>> n) & 0xFF)); } } // The epilogue to cover the last (minLength % 8) elements. - for (int i = minWords * Longs.BYTES; i < minLength; i++) { + for (; i < minLength; i++) { int result = UnsignedBytes.compare( buffer1[offset1 + i], buffer2[offset2 + i]);