HADOOP-14313. Replace/improve Hadoop's byte[] comparator. Contributed by Vikas Vishwakarma.

2018-06-28 14:58:40 +09:00 · 2018-06-28 14:58:40 +09:00 · ddbff7c8d3
commit ddbff7c8d3
parent 2b2399d623
2 changed files with 25 additions and 27 deletions
--- a/NOTICE.txt
+++ b/NOTICE.txt
@ -196,6 +196,14 @@ by Google Inc, which can be obtained at:
  * HOMEPAGE:
    * http://code.google.com/p/snappy/
 This product contains a modified portion of UnsignedBytes LexicographicalComparator
 from Guava v21 project by Google Inc, which can be obtained at:
  * LICENSE:
    * license/COPYING (Apache License 2.0)
  * HOMEPAGE:
    * https://github.com/google/guava
 This product optionally depends on 'JBoss Marshalling', an alternative Java
 serialization API, which can be obtained at:
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java
@ -26,7 +26,6 @@
 import org.slf4j.LoggerFactory;
 import sun.misc.Unsafe;
 import com.google.common.primitives.Longs;
 import com.google.common.primitives.UnsignedBytes;
 /**
@ -195,52 +194,43 @@ public int compareTo(byte[] buffer1, int offset1, int length1,
            length1 == length2) {
          return 0;
        }
        final int stride = 8;
        int minLength = Math.min(length1, length2);
-        int minWords = minLength / Longs.BYTES;
+        int strideLimit = minLength & ~(stride - 1);
        int offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET;
        int offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET;
        int i;
        /*
         * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a
         * time is no slower than comparing 4 bytes at a time even on 32-bit.
         * On the other hand, it is substantially faster on 64-bit.
         */
-        for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) {
+        for (i = 0; i < strideLimit; i += stride) {
          long lw = theUnsafe.getLong(buffer1, offset1Adj + (long) i);
          long rw = theUnsafe.getLong(buffer2, offset2Adj + (long) i);
          long diff = lw ^ rw;
-          if (diff != 0) {
+          if (lw != rw) {
            if (!littleEndian) {
              return lessThanUnsigned(lw, rw) ? -1 : 1;
            }
-            // Use binary search
+            /*
-            int n = 0;
+             * We want to compare only the first index where left[index] !=
-            int y;
+             * right[index]. This corresponds to the least significant nonzero
-            int x = (int) diff;
+             * byte in lw ^ rw, since lw and rw are little-endian.
-            if (x == 0) {
+             * Long.numberOfTrailingZeros(diff) tells us the least significant
-              x = (int) (diff >>> 32);
+             * nonzero bit, and clearing the low three bits of that value gives
-              n = 32;
+             * us the shift to get that least significant nonzero byte. This
-            }
+             * comparison logic is based on UnsignedBytes from Guava v21
-
+             */
-            y = x << 16;
+            int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7;
-            if (y == 0) {
+            return ((int) ((lw >>> n) & 0xFF)) - ((int) ((rw >>> n) & 0xFF));
              n += 16;
            } else {
              x = y;
            }
            y = x << 8;
            if (y == 0) {
              n += 8;
            }
            return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL));
          }
        }
        // The epilogue to cover the last (minLength % 8) elements.
-        for (int i = minWords * Longs.BYTES; i < minLength; i++) {
+        for (; i < minLength; i++) {
          int result = UnsignedBytes.compare(
              buffer1[offset1 + i],
              buffer2[offset2 + i]);