From 5839aa01c88c4fc6d2bc4860193789d19ff5e79f Mon Sep 17 00:00:00 2001
From: Todd Lipcon <todd@apache.org>
Date: Fri, 28 Oct 2011 22:51:47 +0000
Subject: [PATCH] HADOOP-7446. Implement CRC32C native code using SSE4.2
 instructions. Contributed by Kihwal Lee and Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1190650 13f79535-47bb-0310-9956-ffa450edef68
---
 .../hadoop-common/CHANGES.txt                 |   3 +
 .../src/org/apache/hadoop/util/NativeCrc32.c  |   5 +
 .../src/org/apache/hadoop/util/bulk_crc32.c   | 508 +++++++++++++++++-
 3 files changed, 508 insertions(+), 8 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt
index 04139df529..46fda021de 100644
--- a/hadoop-common-project/hadoop-common/CHANGES.txt
+++ b/hadoop-common-project/hadoop-common/CHANGES.txt
@@ -520,6 +520,9 @@ Release 0.23.0 - Unreleased
     HADOOP-7753. Support fadvise and sync_file_range in NativeIO. Add
     ReadaheadPool infrastructure for use in HDFS and MR. (todd)
 
+    HADOOP-7446. Implement CRC32C native code using SSE4.2 instructions.
+    (Kihwal Lee and todd via todd)
+
   BUG FIXES
 
     HADOOP-7740. Fixed security audit logger configuration. (Arpit Gupta via
     Eric Yang)
diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/NativeCrc32.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/NativeCrc32.c
index cba6c7c53e..869c2ba2e8 100644
--- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/NativeCrc32.c
+++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/NativeCrc32.c
@@ -124,6 +124,11 @@ JNIEXPORT void JNICALL Java_org_apache_hadoop_util_NativeCrc32_nativeVerifyChunk
       "bad offsets or lengths");
     return;
   }
+  if (unlikely(bytes_per_checksum <= 0)) {
+    THROW(env, "java/lang/IllegalArgumentException",
+      "invalid bytes_per_checksum");
+    return;
+  }
 
   uint32_t *sums = (uint32_t *)(sums_addr + sums_offset);
   uint8_t *data = data_addr + data_offset;
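Note that the comparison belongs inside unlikely(): the macro, in its usual GCC form, normalizes its argument to 0 or 1, so writing unlikely(bytes_per_checksum) <= 0 would only catch a value of exactly zero and let negative block sizes through. A minimal sketch of the branch-prediction hints, assuming gcc_optimizations.h (not part of this patch) defines them in the conventional way:

/* Typical GCC branch-prediction hints; assumed to match gcc_optimizations.h. */
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

int check_bytes_per_checksum(int bytes_per_checksum) {
  /* unlikely() must wrap the whole predicate: it returns !!(x), i.e. 0 or 1,
   * so unlikely(n) <= 0 is true only for n == 0 and silently accepts
   * negative block sizes. */
  if (unlikely(bytes_per_checksum <= 0)) {
    return -1;  /* the JNI code THROWs IllegalArgumentException here */
  }
  return 0;
}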
diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
index 4a769b41ed..2f7a0d5a59 100644
--- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
+++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
@@ -21,6 +21,7 @@
  * All rights reserved. Use of this source code is governed by a
  * BSD-style license that can be found in the LICENSE file.
  */
+#include <assert.h>
 #include <arpa/inet.h>
 #include <stdint.h>
 #include <unistd.h>
@@ -30,47 +31,124 @@
 #include "bulk_crc32.h"
 #include "gcc_optimizations.h"
 
+#define USE_PIPELINED
+
 typedef uint32_t (*crc_update_func_t)(uint32_t, const uint8_t *, size_t);
 static uint32_t crc_init();
 static uint32_t crc_val(uint32_t crc);
 static uint32_t crc32_zlib_sb8(uint32_t crc, const uint8_t *buf, size_t length);
 static uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length);
+#ifdef USE_PIPELINED
+static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks);
+#endif // USE_PIPELINED
+
+static int cached_cpu_supports_crc32; // initialized by constructor below
+static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length);
 
 int bulk_verify_crc(const uint8_t *data, size_t data_len,
                     const uint32_t *sums, int checksum_type,
                     int bytes_per_checksum,
                     crc32_error_t *error_info) {
+#ifdef USE_PIPELINED
+  uint32_t crc1, crc2, crc3;
+  int n_blocks = data_len / bytes_per_checksum;
+  int remainder = data_len % bytes_per_checksum;
+  int do_pipelined = 0;
+#endif
+  uint32_t crc;
   crc_update_func_t crc_update_func;
   switch (checksum_type) {
     case CRC32_ZLIB_POLYNOMIAL:
       crc_update_func = crc32_zlib_sb8;
       break;
     case CRC32C_POLYNOMIAL:
-      crc_update_func = crc32c_sb8;
+      if (likely(cached_cpu_supports_crc32)) {
+        crc_update_func = crc32c_hardware;
+#ifdef USE_PIPELINED
+        do_pipelined = 1;
+#endif
+      } else {
+        crc_update_func = crc32c_sb8;
+      }
       break;
     default:
       return INVALID_CHECKSUM_TYPE;
   }
 
+#ifdef USE_PIPELINED
+  if (do_pipelined) {
+    /* Process three blocks at a time */
+    while (likely(n_blocks >= 3)) {
+      crc1 = crc2 = crc3 = crc_init();
+      pipelined_crc32c(&crc1, &crc2, &crc3, data, bytes_per_checksum, 3);
+
+      if ((crc = ntohl(crc_val(crc1))) != *sums)
+        goto return_crc_error;
+      sums++;
+      data += bytes_per_checksum;
+      if ((crc = ntohl(crc_val(crc2))) != *sums)
+        goto return_crc_error;
+      sums++;
+      data += bytes_per_checksum;
+      if ((crc = ntohl(crc_val(crc3))) != *sums)
+        goto return_crc_error;
+      sums++;
+      data += bytes_per_checksum;
+      n_blocks -= 3;
+    }
+
+    /* One or two blocks */
+    if (n_blocks) {
+      crc1 = crc2 = crc_init();
+      pipelined_crc32c(&crc1, &crc2, &crc3, data, bytes_per_checksum, n_blocks);
+
+      if ((crc = ntohl(crc_val(crc1))) != *sums)
+        goto return_crc_error;
+      data += bytes_per_checksum;
+      sums++;
+      if (n_blocks == 2) {
+        if ((crc = ntohl(crc_val(crc2))) != *sums)
+          goto return_crc_error;
+        sums++;
+        data += bytes_per_checksum;
+      }
+    }
+
+    /* For something smaller than a block */
+    if (remainder) {
+      crc1 = crc_init();
+      pipelined_crc32c(&crc1, &crc2, &crc3, data, remainder, 1);
+
+      if ((crc = ntohl(crc_val(crc1))) != *sums)
+        goto return_crc_error;
+    }
+    return CHECKSUMS_VALID;
+  }
+#endif
+
   while (likely(data_len > 0)) {
     int len = likely(data_len >= bytes_per_checksum) ?
       bytes_per_checksum : data_len;
-    uint32_t crc = crc_init();
+    crc = crc_init();
     crc = crc_update_func(crc, data, len);
     crc = ntohl(crc_val(crc));
     if (unlikely(crc != *sums)) {
-      if (error_info != NULL) {
-        error_info->got_crc = crc;
-        error_info->expected_crc = *sums;
-        error_info->bad_data = data;
-      }
-      return INVALID_CHECKSUM_DETECTED;
+      goto return_crc_error;
     }
     data += len;
     data_len -= len;
     sums++;
   }
   return CHECKSUMS_VALID;
+
+return_crc_error:
+  if (error_info != NULL) {
+    error_info->got_crc = crc;
+    error_info->expected_crc = *sums;
+    error_info->bad_data = data;
+  }
+  return INVALID_CHECKSUM_DETECTED;
 }
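The comparisons above work because each entry of sums holds the big-endian, finalized checksum of one block, while the hardware keeps the running CRC in its internal (inverted, native-order) form. A sketch of how one entry would be produced, assuming crc_init()/crc_val() follow the usual CRC convention (all-ones seed, final bit inversion) -- the real definitions live elsewhere in bulk_crc32.c and are not part of this patch:

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t crc_init(void) { return 0xffffffff; }   /* assumed seed */
static uint32_t crc_val(uint32_t crc) { return ~crc; }  /* assumed finalize */

/* Store the checksum of one block in the form bulk_verify_crc expects:
 * finalized, then byte-swapped to big-endian before being written out.
 * 'update' is any of the crc_update_func_t implementations. */
void store_block_checksum(const uint8_t *block, size_t len, uint32_t *sum_out,
                          uint32_t (*update)(uint32_t, const uint8_t *, size_t)) {
  uint32_t crc = crc_init();
  crc = update(crc, block, len);
  *sum_out = htonl(crc_val(crc));  /* matches ntohl(crc_val(crc)) == *sums */
}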
@@ -154,3 +232,417 @@ static uint32_t crc32_zlib_sb8(
   }
   return crc;
 }
+
+///////////////////////////////////////////////////////////////////////////
+// Begin code for SSE4.2 specific hardware support of CRC32C
+///////////////////////////////////////////////////////////////////////////
+
+#if (defined(__amd64__) || defined(__i386)) && defined(__GNUC__)
+# define SSE42_FEATURE_BIT (1 << 20)
+# define CPUID_FEATURES 1
+/**
+ * Call the cpuid instruction to determine CPU feature flags.
+ */
+static uint32_t cpuid(uint32_t eax_in) {
+  uint32_t eax, ebx, ecx, edx;
+# if defined(__PIC__) && !defined(__LP64__)
+  // 32-bit PIC code uses the ebx register for the base offset --
+  // have to save and restore it on the stack
+  asm("pushl %%ebx\n\t"
+      "cpuid\n\t"
+      "movl %%ebx, %[ebx]\n\t"
+      "popl %%ebx"
+      : "=a" (eax), [ebx] "=r"(ebx), "=c"(ecx), "=d"(edx)
+      : "a" (eax_in)
+      : "cc");
+# else
+  asm("cpuid"
+      : "=a" (eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+      : "a"(eax_in)
+      : "cc");
+# endif
+
+  return ecx;
+}
+
+/**
+ * On library load, initialize the cached value above for
+ * whether the cpu supports SSE4.2's crc32 instruction.
+ */
+void __attribute__ ((constructor)) init_cpu_support_flag(void) {
+  uint32_t ecx = cpuid(CPUID_FEATURES);
+  cached_cpu_supports_crc32 = ecx & SSE42_FEATURE_BIT;
+}
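The hand-rolled cpuid() above exists mainly to handle the 32-bit PIC case, where ebx is reserved for the GOT pointer. For comparison, GCC 4.3 and later ship a <cpuid.h> header that performs the same ebx save/restore dance; a minimal equivalent check using those names (from the compiler header, not from this patch) might look like:

#include <cpuid.h>
#include <stdio.h>

int cpu_supports_sse42(void) {
  unsigned int eax, ebx, ecx, edx;
  /* __get_cpuid handles the 32-bit PIC ebx juggling internally and
   * returns 0 if the requested leaf is not supported. */
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (ecx & bit_SSE4_2) != 0;  /* bit 20 of ecx, as in the patch */
}

int main(void) {
  printf("SSE4.2 crc32: %s\n", cpu_supports_sse42() ? "yes" : "no");
  return 0;
}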
+
+//
+// Definitions of the SSE4.2 crc32 operations. Using these instead of
+// the GCC __builtin_* intrinsics allows this code to compile without
+// -msse4.2, since we do dynamic CPU detection at runtime.
+//
+
+# ifdef __LP64__
+inline uint64_t _mm_crc32_u64(uint64_t crc, uint64_t value) {
+  asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+# endif
+
+inline uint32_t _mm_crc32_u32(uint32_t crc, uint32_t value) {
+  asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
+inline uint32_t _mm_crc32_u16(uint32_t crc, uint16_t value) {
+  asm("crc32w %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
+inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) {
+  asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
+# ifdef __LP64__
+/**
+ * Hardware-accelerated CRC32C calculation using the 64-bit instructions.
+ */
+static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) {
+  // start directly at p_buf, even if it's an unaligned address. According
+  // to the original author of this code, doing a small run of single bytes
+  // to word-align the 64-bit instructions doesn't seem to help, but
+  // we haven't reconfirmed those benchmarks ourselves.
+  uint64_t crc64bit = crc;
+  size_t i;
+  for (i = 0; i < length / sizeof(uint64_t); i++) {
+    crc64bit = _mm_crc32_u64(crc64bit, *(uint64_t*) p_buf);
+    p_buf += sizeof(uint64_t);
+  }
+
+  // This ugly switch is slightly faster for short strings than the straightforward loop
+  uint32_t crc32bit = (uint32_t) crc64bit;
+  length &= sizeof(uint64_t) - 1;
+  switch (length) {
+    case 7:
+      crc32bit = _mm_crc32_u8(crc32bit, *p_buf++);
+    case 6:
+      crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*) p_buf);
+      p_buf += 2;
+      // case 5 is below: 4 + 1
+    case 4:
+      crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*) p_buf);
+      break;
+    case 3:
+      crc32bit = _mm_crc32_u8(crc32bit, *p_buf++);
+    case 2:
+      crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*) p_buf);
+      break;
+    case 5:
+      crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*) p_buf);
+      p_buf += 4;
+    case 1:
+      crc32bit = _mm_crc32_u8(crc32bit, *p_buf);
+      break;
+    case 0:
+      break;
+    default:
+      // This should never happen; enable in debug code
+      assert(0 && "ended up with 8 or more bytes at tail of calculation");
+  }
+
+  return crc32bit;
+}
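The pipelined_crc32c() that follows keeps three independent crc32q dependency chains in flight so the three-cycle latency of each instruction overlaps with the other two chains. Stripped of the hand-scheduled inline asm, the idea looks like this plain-C sketch; it is a simplification (it assumes block_size is a multiple of eight and reuses the _mm_crc32_u64 wrapper defined above), not the committed implementation:

/* Conceptual sketch only -- the committed code uses hand-written asm.
 * Three blocks are fed round-robin; each crc32q depends only on its own
 * chain, so the CPU can overlap the three-cycle latencies. */
static void pipelined_crc32c_sketch(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3,
                                    const uint8_t *p_buf, size_t block_size) {
  uint64_t c1 = *crc1, c2 = *crc2, c3 = *crc3;
  const uint64_t *d1 = (const uint64_t *) p_buf;
  const uint64_t *d2 = (const uint64_t *) (p_buf + block_size);
  const uint64_t *d3 = (const uint64_t *) (p_buf + 2 * block_size);
  size_t words = block_size / sizeof(uint64_t);  /* assumes no remainder */
  size_t i;
  for (i = 0; i < words; i++) {
    c1 = _mm_crc32_u64(c1, d1[i]);  /* chain 1 */
    c2 = _mm_crc32_u64(c2, d2[i]);  /* chain 2: independent of chain 1 */
    c3 = _mm_crc32_u64(c3, d3[i]);  /* chain 3: independent of 1 and 2 */
  }
  *crc1 = (uint32_t) c1; *crc2 = (uint32_t) c2; *crc3 = (uint32_t) c3;
}

As the comment in the real function warns, compiling a loop like this leaves GCC free to interleave other instructions between the crc32 ops, which is exactly what the hand-written asm below is tuned to prevent.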
+
+#ifdef USE_PIPELINED
+/**
+ * Pipelined version of hardware-accelerated CRC32C calculation using
+ * the 64 bit crc32q instruction.
+ * One crc32c instruction takes three cycles, but two more with no data
+ * dependency can be in the pipeline to achieve something close to one
+ * instruction per cycle. Here we feed three blocks round-robin.
+ *
+ *   crc1, crc2, crc3 : Store initial checksum for each block before
+ *           calling. When it returns, updated checksums are stored.
+ *   p_buf : The base address of the data buffer. The buffer should be
+ *           at least as big as block_size * num_blocks.
+ *   block_size : The size of each block in bytes.
+ *   num_blocks : The number of blocks to work on. Min = 1, Max = 3
+ */
+static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) {
+  uint64_t c1 = *crc1;
+  uint64_t c2 = *crc2;
+  uint64_t c3 = *crc3;
+  uint64_t *data = (uint64_t*)p_buf;
+  int counter = block_size / sizeof(uint64_t);
+  int remainder = block_size % sizeof(uint64_t);
+  uint8_t *bdata;
+
+  /* We do switch here because the loop has to be tight in order
+   * to fill the pipeline. Any other statement inside the loop
+   * or in between crc32 instructions can slow things down. Calling
+   * individual crc32 instructions three times from C also causes
+   * gcc to insert other instructions in between.
+   *
+   * Do not rearrange the following code unless you have verified
+   * the generated machine code is as efficient as before.
+   */
+  switch (num_blocks) {
+    case 3:
+      /* Do three blocks */
+      while (likely(counter)) {
+        __asm__ __volatile__(
+          "crc32q (%7), %0;\n\t"
+          "crc32q (%7,%6,1), %1;\n\t"
+          "crc32q (%7,%6,2), %2;\n\t"
+          : "=r"(c1), "=r"(c2), "=r"(c3)
+          : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(data)
+        );
+        data++;
+        counter--;
+      }
+
+      /* Take care of the remainder. It is only up to seven bytes,
+       * so performing byte-level crc32 won't take much time.
+       */
+      bdata = (uint8_t*)data;
+      while (likely(remainder)) {
+        __asm__ __volatile__(
+          "crc32b (%7), %0;\n\t"
+          "crc32b (%7,%6,1), %1;\n\t"
+          "crc32b (%7,%6,2), %2;\n\t"
+          : "=r"(c1), "=r"(c2), "=r"(c3)
+          : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(bdata)
+        );
+        bdata++;
+        remainder--;
+      }
+      break;
+    case 2:
+      /* Do two blocks */
+      while (likely(counter)) {
+        __asm__ __volatile__(
+          "crc32q (%5), %0;\n\t"
+          "crc32q (%5,%4,1), %1;\n\t"
+          : "=r"(c1), "=r"(c2)
+          : "0"(c1), "1"(c2), "r"(block_size), "r"(data)
+        );
+        data++;
+        counter--;
+      }
+
+      bdata = (uint8_t*)data;
+      while (likely(remainder)) {
+        __asm__ __volatile__(
+          "crc32b (%5), %0;\n\t"
+          "crc32b (%5,%4,1), %1;\n\t"
+          : "=r"(c1), "=r"(c2)
+          : "0"(c1), "1"(c2), "r"(block_size), "r"(bdata)
+        );
+        bdata++;
+        remainder--;
+      }
+      break;
+    case 1:
+      /* single block */
+      while (likely(counter)) {
+        __asm__ __volatile__(
+          "crc32q (%2), %0;\n\t"
+          : "=r"(c1)
+          : "0"(c1), "r"(data)
+        );
+        data++;
+        counter--;
+      }
+      bdata = (uint8_t*)data;
+      while (likely(remainder)) {
+        __asm__ __volatile__(
+          "crc32b (%2), %0;\n\t"
+          : "=r"(c1)
+          : "0"(c1), "r"(bdata)
+        );
+        bdata++;
+        remainder--;
+      }
+      break;
+    case 0:
+      return;
+    default:
+      assert(0 && "BUG: Invalid number of checksum blocks");
+  }
+
+  *crc1 = c1;
+  *crc2 = c2;
+  *crc3 = c3;
+  return;
+}
+#endif /* USE_PIPELINED */
+
+# else // 32-bit
+
+/**
+ * Hardware-accelerated CRC32C calculation using the 32-bit instructions.
+ */
+static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) {
+  // start directly at p_buf, even if it's an unaligned address. According
+  // to the original author of this code, doing a small run of single bytes
+  // to word-align the 32-bit instructions doesn't seem to help, but
+  // we haven't reconfirmed those benchmarks ourselves.
+  size_t i;
+  for (i = 0; i < length / sizeof(uint32_t); i++) {
+    crc = _mm_crc32_u32(crc, *(uint32_t*) p_buf);
+    p_buf += sizeof(uint32_t);
+  }
+
+  // This ugly switch is slightly faster for short strings than the straightforward loop
+  length &= sizeof(uint32_t) - 1;
+  switch (length) {
+    case 3:
+      crc = _mm_crc32_u8(crc, *p_buf++);
+    case 2:
+      crc = _mm_crc32_u16(crc, *(uint16_t*) p_buf);
+      break;
+    case 1:
+      crc = _mm_crc32_u8(crc, *p_buf);
+      break;
+    case 0:
+      break;
+    default:
+      // This should never happen; enable in debug code
+      assert(0 && "ended up with 4 or more bytes at tail of calculation");
+  }
+
+  return crc;
+}
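The "ugly switch" in both variants unrolls the tail by hand, folding the leftover bytes in u32/u16/u8 chunks. The straightforward loop it was benchmarked against would be the byte-at-a-time version below (a sketch for the 32-bit variant, compiled alongside the _mm_crc32_u8 wrapper above; functionally identical, just reportedly a bit slower on short strings):

/* Equivalent tail handling without the switch: the up-to-three trailing
 * bytes (length & 3) are folded in one at a time. */
static uint32_t crc32c_tail_loop(uint32_t crc, const uint8_t *p_buf, size_t length) {
  size_t i;
  for (i = 0; i < (length & (sizeof(uint32_t) - 1)); i++) {
    crc = _mm_crc32_u8(crc, p_buf[i]);
  }
  return crc;
}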
+
+#ifdef USE_PIPELINED
+/**
+ * Pipelined version of hardware-accelerated CRC32C calculation using
+ * the 32 bit crc32l instruction.
+ * One crc32c instruction takes three cycles, but two more with no data
+ * dependency can be in the pipeline to achieve something close to one
+ * instruction per cycle. Here we feed three blocks round-robin.
+ *
+ *   crc1, crc2, crc3 : Store initial checksum for each block before
+ *           calling. When it returns, updated checksums are stored.
+ *   p_buf : The base address of the data buffer. The buffer should be
+ *           at least as big as block_size * num_blocks.
+ *   block_size : The size of each block in bytes.
+ *   num_blocks : The number of blocks to work on. Min = 1, Max = 3
+ */
+static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) {
+  uint32_t c1 = *crc1;
+  uint32_t c2 = *crc2;
+  uint32_t c3 = *crc3;
+  int counter = block_size / sizeof(uint32_t);
+  int remainder = block_size % sizeof(uint32_t);
+  uint32_t *data = (uint32_t*)p_buf;
+  uint8_t *bdata;
+
+  /* We do switch here because the loop has to be tight in order
+   * to fill the pipeline. Any other statement inside the loop
+   * or in between crc32 instructions can slow things down. Calling
+   * individual crc32 instructions three times from C also causes
+   * gcc to insert other instructions in between.
+   *
+   * Do not rearrange the following code unless you have verified
+   * the generated machine code is as efficient as before.
+   */
+  switch (num_blocks) {
+    case 3:
+      /* Do three blocks */
+      while (likely(counter)) {
+        __asm__ __volatile__(
+          "crc32l (%7), %0;\n\t"
+          "crc32l (%7,%6,1), %1;\n\t"
+          "crc32l (%7,%6,2), %2;\n\t"
+          : "=r"(c1), "=r"(c2), "=r"(c3)
+          : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(data)
+        );
+        data++;
+        counter--;
+      }
+      /* Take care of the remainder. It is only up to three bytes,
+       * so performing byte-level crc32 won't take much time.
+       */
+      bdata = (uint8_t*)data;
+      while (likely(remainder)) {
+        __asm__ __volatile__(
+          "crc32b (%7), %0;\n\t"
+          "crc32b (%7,%6,1), %1;\n\t"
+          "crc32b (%7,%6,2), %2;\n\t"
+          : "=r"(c1), "=r"(c2), "=r"(c3)
+          : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(bdata)
+        );
+        bdata++;
+        remainder--;
+      }
+      break;
+    case 2:
+      /* Do two blocks */
+      while (likely(counter)) {
+        __asm__ __volatile__(
+          "crc32l (%5), %0;\n\t"
+          "crc32l (%5,%4,1), %1;\n\t"
+          : "=r"(c1), "=r"(c2)
+          : "0"(c1), "1"(c2), "r"(block_size), "r"(data)
+        );
+        data++;
+        counter--;
+      }
+
+      bdata = (uint8_t*)data;
+      while (likely(remainder)) {
+        __asm__ __volatile__(
+          "crc32b (%5), %0;\n\t"
+          "crc32b (%5,%4,1), %1;\n\t"
+          : "=r"(c1), "=r"(c2)
+          : "0"(c1), "1"(c2), "r"(block_size), "r"(bdata)
+        );
+        bdata++;
+        remainder--;
+      }
+      break;
+    case 1:
+      /* single block */
+      while (likely(counter)) {
+        __asm__ __volatile__(
+          "crc32l (%2), %0;\n\t"
+          : "=r"(c1)
+          : "0"(c1), "r"(data)
+        );
+        data++;
+        counter--;
+      }
+      bdata = (uint8_t*)data;
+      while (likely(remainder)) {
+        __asm__ __volatile__(
+          "crc32b (%2), %0;\n\t"
+          : "=r"(c1)
+          : "0"(c1), "r"(bdata)
+        );
+        bdata++;
+        remainder--;
+      }
+      break;
+    case 0:
+      return;
+    default:
+      assert(0 && "BUG: Invalid number of checksum blocks");
+  }
+
+  *crc1 = c1;
+  *crc2 = c2;
+  *crc3 = c3;
+  return;
+}
+
+#endif /* USE_PIPELINED */
+
+# endif // 64-bit vs 32-bit
+
+#else // end x86 architecture
+
+static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length) {
+  // never called!
+  assert(0 && "hardware crc called on an unsupported platform");
+  return 0;
+}
+
+#endif
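Taken together, a caller exercises the new path through the unchanged bulk_verify_crc() entry point. A minimal usage sketch, assuming the declarations in bulk_crc32.h (not shown in this patch) match the names used in the diff -- CRC32C_POLYNOMIAL, CHECKSUMS_VALID, INVALID_CHECKSUM_DETECTED, and a crc32_error_t with got_crc/expected_crc/bad_data -- and that sums already holds the big-endian per-block checksums:

#include <stdint.h>
#include <stdio.h>
#include "bulk_crc32.h"

int verify_buffer(const uint8_t *data, size_t data_len,
                  const uint32_t *sums, int bytes_per_checksum) {
  crc32_error_t err;
  /* Dispatches to the pipelined SSE4.2 path when the library constructor
   * detected hardware crc32 support, and to crc32c_sb8 otherwise. */
  int rc = bulk_verify_crc(data, data_len, sums, CRC32C_POLYNOMIAL,
                           bytes_per_checksum, &err);
  if (rc == INVALID_CHECKSUM_DETECTED) {
    fprintf(stderr, "bad checksum at offset %ld: got 0x%08x, expected 0x%08x\n",
            (long)(err.bad_data - data),
            (unsigned)err.got_crc, (unsigned)err.expected_crc);
  }
  return rc;  /* CHECKSUMS_VALID on success */
}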