From 7ecaa81d2746a805b9937fcd134ceaa8607ed7b5 Mon Sep 17 00:00:00 2001 From: Binglin Chang Date: Wed, 6 Aug 2014 06:01:12 +0000 Subject: [PATCH] MAPREDUCE-5984. native-task: Reuse lz4 sources in hadoop-common. Contributed by Binglin Chang git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/MR-2841@1616105 13f79535-47bb-0310-9956-ffa450edef68 --- .../CHANGES.MAPREDUCE-2841.txt | 1 + .../pom.xml | 6 + .../src/CMakeLists.txt | 4 +- .../src/main/native/COPYING | 12 +- .../src/main/native/lz4/lz4.c | 740 ------------------ .../src/main/native/lz4/lz4.h | 96 --- .../src/main/native/src/codec/Lz4Codec.cc | 22 +- .../src/main/native/test/TestCompressions.cc | 6 +- .../src/main/native/test/TestIFile.cc | 2 +- 9 files changed, 21 insertions(+), 868 deletions(-) delete mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/lz4/lz4.c delete mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/lz4/lz4.h diff --git a/hadoop-mapreduce-project/CHANGES.MAPREDUCE-2841.txt b/hadoop-mapreduce-project/CHANGES.MAPREDUCE-2841.txt index c70df12b0c..9dddcd5d12 100644 --- a/hadoop-mapreduce-project/CHANGES.MAPREDUCE-2841.txt +++ b/hadoop-mapreduce-project/CHANGES.MAPREDUCE-2841.txt @@ -9,3 +9,4 @@ MAPREDUCE-6000. native-task: Simplify ByteBufferDataReader/Writer (todd) MAPREDUCE-5991. native-task should not run unit tests if native profile is not enabled. (Binglin Chang) MAPREDUCE-5995. native-task: Revert changes to Text internals (todd) MAPREDUCE-6005. native-task: Fix some valgrind errors (Binglin Chang) +MAPREDUCE-5984. native-task: Reuse lz4 sources in hadoop-common (Binglin Chang) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/pom.xml index 2cb483e1db..9727800b4e 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/pom.xml @@ -163,6 +163,12 @@ + + + diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/CMakeLists.txt b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/CMakeLists.txt index 309416234b..36dbd9c306 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/CMakeLists.txt +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/CMakeLists.txt @@ -155,7 +155,7 @@ include_directories( ${D}/test ${CMAKE_CURRENT_SOURCE_DIR} #${CMAKE_CURRENT_SOURCE_DIR}/src - #${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR} ${JNI_INCLUDE_DIRS} ${SNAPPY_INCLUDE_DIR} ) @@ -174,7 +174,7 @@ else (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") add_dual_library(nativetask - ${D}/lz4/lz4.c + ${CMAKE_BINARY_DIR}/lz4.c ${D}/cityhash/city.cc ${D}/src/codec/BlockCodec.cc ${D}/src/codec/GzipCodec.cc diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/COPYING b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/COPYING index c5d342079d..1d60d1d8ed 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/COPYING +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/COPYING @@ -56,20 +56,20 @@ LZ4 --------------------------------------------------------------------- LZ4 - Fast LZ compression algorithm Header File - Copyright (C) 2011, Yann Collet. - BSD License + Copyright (C) 2011-2014, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -81,3 +81,7 @@ LZ4 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : http://code.google.com/p/lz4/ + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c \ No newline at end of file diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/lz4/lz4.c b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/lz4/lz4.c deleted file mode 100644 index 345a436b3f..0000000000 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/lz4/lz4.c +++ /dev/null @@ -1,740 +0,0 @@ -/* - LZ4 - Fast LZ compression algorithm - Copyright (C) 2011-2012, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -//************************************** -// Compilation Directives -//************************************** -#if __STDC_VERSION__ >= 199901L - /* "restrict" is a known keyword */ -#else -#define restrict // Disable restrict -#endif - -#ifdef _MSC_VER -#define inline __forceinline -#endif - -#ifdef __GNUC__ -#define _PACKED __attribute__ ((packed)) -#else -#define _PACKED -#endif - -#if (__x86_64__ || __ppc64__ || _WIN64 || __LP64__) // Detect 64 bits mode -#define ARCH64 1 -#else -#define ARCH64 0 -#endif - - -//************************************** -// Includes -//************************************** -#include // for malloc -#include // for memset -#include "lz4.h" - - -//************************************** -// Performance parameter -//************************************** -// Increasing this value improves compression ratio -// Lowering this value reduces memory usage -// Lowering may also improve speed, typically on reaching cache size limits (L1 32KB for Intel, 64KB for AMD) -// Memory usage formula for 32 bits systems : N->2^(N+2) Bytes (examples : 17 -> 512KB ; 12 -> 16KB) -#define HASH_LOG 12 - -//#define _FORCE_SW_BITCOUNT // Uncomment for better performance if target platform has no hardware support for LowBitCount - - -//************************************** -// Basic Types -//************************************** -#if defined(_MSC_VER) // Visual Studio does not support 'stdint' natively -#define BYTE unsigned __int8 -#define U16 unsigned __int16 -#define U32 unsigned __int32 -#define S32 __int32 -#define U64 unsigned __int64 -#else -#include -#define BYTE uint8_t -#define U16 uint16_t -#define U32 uint32_t -#define S32 int32_t -#define U64 uint64_t -#endif - - -//************************************** -// Constants -//************************************** -#define MINMATCH 4 -#define SKIPSTRENGTH 6 -#define STACKLIMIT 13 -#define HEAPMODE (HASH_LOG>STACKLIMIT) // Defines if memory is allocated into the stack (local variable), or into the heap (malloc()). -#define COPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH+MINMATCH) -#define MINLENGTH (MFLIMIT+1) - -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) - -#define HASHTABLESIZE (1 << HASH_LOG) -#define HASH_MASK (HASHTABLESIZE - 1) - -#define ML_BITS 4 -#define ML_MASK ((1U<v) -#define A32(x) (((U32_S *)(x))->v) -#define A16(x) (((U16_S *)(x))->v) - - -//************************************** -// Architecture-specific macros -//************************************** -#if ARCH64 // 64-bit -#define STEPSIZE 8 -#define UARCH U64 -#define AARCH A64 -#define LZ4_COPYSTEP(s,d) A64(d) = A64(s); d+=8; s+=8; -#define LZ4_COPYPACKET(s,d) LZ4_COPYSTEP(s,d) -#define LZ4_SECURECOPY(s,d,e) if (d>8; } -#define LZ4_NbCommonBytes LZ4_NbCommonBytes_BigEndian -#endif - - -//************************************** -// Macros -//************************************** -#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8)-HASH_LOG)) -#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) -#define LZ4_WILDCOPY(s,d,e) do { LZ4_COPYPACKET(s,d) } while (d>3); - #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT) - return (__builtin_ctzll(val) >> 3); - #else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58]; - #endif -} - -inline static int LZ4_NbCommonBytes_BigEndian (register U64 val) -{ - #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (int)(r>>3); - #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) >> 3); - #else - int r; - if (!(val>>32)) { r=4; } else { r=0; val>>=32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; - #endif -} - -#else - -inline static int LZ4_NbCommonBytes_LittleEndian (register U32 val) -{ - #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward( &r, val ); - return (int)(r>>3); - #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); - #else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -val) * 0x077CB531U)) >> 27]; - #endif -} - -inline static int LZ4_NbCommonBytes_BigEndian (register U32 val) -{ - #if defined(_MSC_VER) && !defined(_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse( &r, val ); - return (int)(r>>3); - #elif defined(__GNUC__) && !defined(_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); - #else - int r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; - #endif -} - -#endif - - -//****************************** -// Public Compression functions -//****************************** - -int LZ4_compressCtx(void** ctx, - char* source, - char* dest, - int isize) -{ -#if HEAPMODE - struct refTables *srt = (struct refTables *) (*ctx); - HTYPE* HashTable; -#else - HTYPE HashTable[HASHTABLESIZE] = {0}; -#endif - - const BYTE* ip = (BYTE*) source; - INITBASE(base); - const BYTE* anchor = ip; - const BYTE* const iend = ip + isize; - const BYTE* const mflimit = iend - MFLIMIT; -#define matchlimit (iend - LASTLITERALS) - - BYTE* op = (BYTE*) dest; - - int len, length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - - // Init - if (isizehashTable); - memset((void*)HashTable, 0, sizeof(srt->hashTable)); -#else - (void) ctx; -#endif - - - // First Byte - HashTable[LZ4_HASH_VALUE(ip)] = ip - base; - ip++; forwardH = LZ4_HASH_VALUE(ip); - - // Main Loop - for ( ; ; ) - { - int findMatchAttempts = (1U << skipStrength) + 3; - const BYTE* forwardIp = ip; - const BYTE* ref; - BYTE* token; - - // Find a match - do { - U32 h = forwardH; - int step = findMatchAttempts++ >> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if (forwardIp > mflimit) { goto _last_literals; } - - forwardH = LZ4_HASH_VALUE(forwardIp); - ref = base + HashTable[h]; - HashTable[h] = ip - base; - - } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); - - // Catch up - while ((ip>anchor) && (ref>(BYTE*)source) && (ip[-1]==ref[-1])) { ip--; ref--; } - - // Encode Literal length - length = ip - anchor; - token = op++; - if (length>=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } - else *token = (length<=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } - else *token += len; - - // Test end of chunk - if (ip > mflimit) { anchor = ip; break; } - - // Fill table - HashTable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; - - // Test next position - ref = base + HashTable[LZ4_HASH_VALUE(ip)]; - HashTable[LZ4_HASH_VALUE(ip)] = ip - base; - if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; } - - // Prepare next loop - anchor = ip++; - forwardH = LZ4_HASH_VALUE(ip); - } - -_last_literals: - // Encode Last Literals - { - int lastRun = iend - anchor; - if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } - else *op++ = (lastRun<> ((MINMATCH*8)-HASHLOG64K)) -#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) -int LZ4_compress64kCtx(void** ctx, - char* source, - char* dest, - int isize) -{ -#if HEAPMODE - struct refTables *srt = (struct refTables *) (*ctx); - U16* HashTable; -#else - U16 HashTable[HASH64KTABLESIZE] = {0}; -#endif - - const BYTE* ip = (BYTE*) source; - const BYTE* anchor = ip; - const BYTE* const base = ip; - const BYTE* const iend = ip + isize; - const BYTE* const mflimit = iend - MFLIMIT; -#define matchlimit (iend - LASTLITERALS) - - BYTE* op = (BYTE*) dest; - - int len, length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - - // Init - if (isizehashTable); - memset((void*)HashTable, 0, sizeof(srt->hashTable)); -#else - (void) ctx; -#endif - - - // First Byte - ip++; forwardH = LZ4_HASH64K_VALUE(ip); - - // Main Loop - for ( ; ; ) - { - int findMatchAttempts = (1U << skipStrength) + 3; - const BYTE* forwardIp = ip; - const BYTE* ref; - BYTE* token; - - // Find a match - do { - U32 h = forwardH; - int step = findMatchAttempts++ >> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if (forwardIp > mflimit) { goto _last_literals; } - - forwardH = LZ4_HASH64K_VALUE(forwardIp); - ref = base + HashTable[h]; - HashTable[h] = ip - base; - - } while (A32(ref) != A32(ip)); - - // Catch up - while ((ip>anchor) && (ref>(BYTE*)source) && (ip[-1]==ref[-1])) { ip--; ref--; } - - // Encode Literal length - length = ip - anchor; - token = op++; - if (length>=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } - else *token = (length<=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } - else *token += len; - - // Test end of chunk - if (ip > mflimit) { anchor = ip; break; } - - // Fill table - HashTable[LZ4_HASH64K_VALUE(ip-2)] = ip - 2 - base; - - // Test next position - ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; - HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base; - if (A32(ref) == A32(ip)) { token = op++; *token=0; goto _next_match; } - - // Prepare next loop - anchor = ip++; - forwardH = LZ4_HASH64K_VALUE(ip); - } - -_last_literals: - // Encode Last Literals - { - int lastRun = iend - anchor; - if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } - else *op++ = (lastRun<>ML_BITS)) == RUN_MASK) { for (;(len=*ip++)==255;length+=255){} length += len; } - - // copy literals - cpy = op+length; - if (cpy>oend-COPYLENGTH) - { - if (cpy > oend) goto _output_error; - memcpy(op, ip, length); - ip += length; - break; // Necessarily EOF - } - LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; - - // get offset - LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; - if (ref < (BYTE* const)dest) goto _output_error; - - // get matchlength - if ((length=(token&ML_MASK)) == ML_MASK) { for (;*ip==255;length+=255) {ip++;} length += *ip++; } - - // copy repeated sequence - if (op-refoend-COPYLENGTH) - { - if (cpy > oend) goto _output_error; - LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); - while(op>ML_BITS)) == RUN_MASK) { for (;(len=*ip++)==255;length+=255){} length += len; } - - // copy literals - cpy = op+length; - if (cpy>oend-COPYLENGTH) - { - if (cpy > oend) goto _output_error; - memcpy(op, ip, length); - op += length; - break; // Necessarily EOF - } - LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; - if (ip>=iend) break; // check EOF - - // get offset - LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; - if (ref < (BYTE* const)dest) goto _output_error; - - // get matchlength - if ((length=(token&ML_MASK)) == ML_MASK) { for (;(len=*ip++)==255;length+=255){} length += len; } - - // copy repeated sequence - if (op-refoend-COPYLENGTH) - { - if (cpy > oend) goto _output_error; - LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); - while(op