From 0e75187199076b7fc5a448bb8a143d3ae32838d3 Mon Sep 17 00:00:00 2001 From: Todd Lipcon Date: Thu, 27 Oct 2011 05:59:05 +0000 Subject: [PATCH] HADOOP-7761. Improve the performance of raw comparisons. Contributed by Todd Lipcon. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1189613 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-common/CHANGES.txt | 4 + .../apache/hadoop/io/FastByteComparisons.java | 237 ++++++++++++++++++ .../apache/hadoop/io/WritableComparator.java | 11 +- 3 files changed, 242 insertions(+), 10 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 8887badd2d..154f886ea6 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -93,6 +93,10 @@ Trunk (unreleased changes) HADOOP-7766. The auth to local mappings are not being respected, with webhdfs and security enabled. (jitendra) + OPTIMIZATIONS + + HADOOP-7761. Improve the performance of raw comparisons. (todd) + Release 0.23.0 - Unreleased INCOMPATIBLE CHANGES diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java new file mode 100644 index 0000000000..3f5881b2dd --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java @@ -0,0 +1,237 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.io; + +import java.lang.reflect.Field; +import java.nio.ByteOrder; +import java.security.AccessController; +import java.security.PrivilegedAction; + +import sun.misc.Unsafe; + +import com.google.common.primitives.Longs; +import com.google.common.primitives.UnsignedBytes; + +/** + * Utility code to do optimized byte-array comparison. + * This is borrowed and slightly modified from Guava's {@link UnsignedBytes} + * class to be able to compare arrays that start at non-zero offsets. + */ +abstract class FastByteComparisons { + + /** + * Lexicographically compare two byte arrays. + */ + public static int compareTo(byte[] b1, int s1, int l1, byte[] b2, int s2, + int l2) { + return LexicographicalComparerHolder.BEST_COMPARER.compareTo( + b1, s1, l1, b2, s2, l2); + } + + + private interface Comparer { + abstract public int compareTo(T buffer1, int offset1, int length1, + T buffer2, int offset2, int length2); + } + + private static Comparer lexicographicalComparerJavaImpl() { + return LexicographicalComparerHolder.PureJavaComparer.INSTANCE; + } + + + /** + * Provides a lexicographical comparer implementation; either a Java + * implementation or a faster implementation based on {@link Unsafe}. + * + *

Uses reflection to gracefully fall back to the Java implementation if + * {@code Unsafe} isn't available. + */ + private static class LexicographicalComparerHolder { + static final String UNSAFE_COMPARER_NAME = + LexicographicalComparerHolder.class.getName() + "$UnsafeComparer"; + + static final Comparer BEST_COMPARER = getBestComparer(); + /** + * Returns the Unsafe-using Comparer, or falls back to the pure-Java + * implementation if unable to do so. + */ + static Comparer getBestComparer() { + try { + Class theClass = Class.forName(UNSAFE_COMPARER_NAME); + + // yes, UnsafeComparer does implement Comparer + @SuppressWarnings("unchecked") + Comparer comparer = + (Comparer) theClass.getEnumConstants()[0]; + return comparer; + } catch (Throwable t) { // ensure we really catch *everything* + return lexicographicalComparerJavaImpl(); + } + } + + private enum PureJavaComparer implements Comparer { + INSTANCE; + + @Override + public int compareTo(byte[] buffer1, int offset1, int length1, + byte[] buffer2, int offset2, int length2) { + // Short circuit equal case + if (buffer1 == buffer2 && + offset1 == offset2 && + length1 == length2) { + return 0; + } + // Bring WritableComparator code local + int end1 = offset1 + length1; + int end2 = offset2 + length2; + for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) { + int a = (buffer1[i] & 0xff); + int b = (buffer2[j] & 0xff); + if (a != b) { + return a - b; + } + } + return length1 - length2; + } + } + + @SuppressWarnings("unused") // used via reflection + private enum UnsafeComparer implements Comparer { + INSTANCE; + + static final Unsafe theUnsafe; + + /** The offset to the first element in a byte array. */ + static final int BYTE_ARRAY_BASE_OFFSET; + + static { + theUnsafe = (Unsafe) AccessController.doPrivileged( + new PrivilegedAction() { + @Override + public Object run() { + try { + Field f = Unsafe.class.getDeclaredField("theUnsafe"); + f.setAccessible(true); + return f.get(null); + } catch (NoSuchFieldException e) { + // It doesn't matter what we throw; + // it's swallowed in getBestComparer(). + throw new Error(); + } catch (IllegalAccessException e) { + throw new Error(); + } + } + }); + + BYTE_ARRAY_BASE_OFFSET = theUnsafe.arrayBaseOffset(byte[].class); + + // sanity check - this should never fail + if (theUnsafe.arrayIndexScale(byte[].class) != 1) { + throw new AssertionError(); + } + } + + static final boolean littleEndian = + ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN); + + /** + * Returns true if x1 is less than x2, when both values are treated as + * unsigned. + */ + static boolean lessThanUnsigned(long x1, long x2) { + return (x1 + Long.MIN_VALUE) < (x2 + Long.MIN_VALUE); + } + + /** + * Lexicographically compare two arrays. + * + * @param buffer1 left operand + * @param buffer2 right operand + * @param offset1 Where to start comparing in the left buffer + * @param offset2 Where to start comparing in the right buffer + * @param length1 How much to compare from the left buffer + * @param length2 How much to compare from the right buffer + * @return 0 if equal, < 0 if left is less than right, etc. + */ + @Override + public int compareTo(byte[] buffer1, int offset1, int length1, + byte[] buffer2, int offset2, int length2) { + // Short circuit equal case + if (buffer1 == buffer2 && + offset1 == offset2 && + length1 == length2) { + return 0; + } + int minLength = Math.min(length1, length2); + int minWords = minLength / Longs.BYTES; + int offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET; + int offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET; + + /* + * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a + * time is no slower than comparing 4 bytes at a time even on 32-bit. + * On the other hand, it is substantially faster on 64-bit. + */ + for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) { + long lw = theUnsafe.getLong(buffer1, offset1Adj + (long) i); + long rw = theUnsafe.getLong(buffer2, offset2Adj + (long) i); + long diff = lw ^ rw; + + if (diff != 0) { + if (!littleEndian) { + return lessThanUnsigned(lw, rw) ? -1 : 1; + } + + // Use binary search + int n = 0; + int y; + int x = (int) diff; + if (x == 0) { + x = (int) (diff >>> 32); + n = 32; + } + + y = x << 16; + if (y == 0) { + n += 16; + } else { + x = y; + } + + y = x << 8; + if (y == 0) { + n += 8; + } + return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL)); + } + } + + // The epilogue to cover the last (minLength % 8) elements. + for (int i = minWords * Longs.BYTES; i < minLength; i++) { + int result = UnsignedBytes.compare( + buffer1[offset1 + i], + buffer2[offset2 + i]); + if (result != 0) { + return result; + } + } + return length1 - length2; + } + } + } +} \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableComparator.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableComparator.java index cab52cf2bd..6eb3a21443 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableComparator.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableComparator.java @@ -151,16 +151,7 @@ public int compare(Object a, Object b) { /** Lexicographic order of binary data. */ public static int compareBytes(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { - int end1 = s1 + l1; - int end2 = s2 + l2; - for (int i = s1, j = s2; i < end1 && j < end2; i++, j++) { - int a = (b1[i] & 0xff); - int b = (b2[j] & 0xff); - if (a != b) { - return a - b; - } - } - return l1 - l2; + return FastByteComparisons.compareTo(b1, s1, l1, b2, s2, l2); } /** Compute hash for binary data. */