// Copyright 2008 The CRC32C Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "./crc32c_sse42.h" // In a separate source file to allow this accelerated CRC32C function to be // compiled with the appropriate compiler flags to enable SSE4.2 instructions. // This implementation is loosely based on Intel Pub 323405 from April 2011, // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction". #include #include #include "./crc32c_internal.h" #include "./crc32c_prefetch.h" #include "./crc32c_read_le.h" #include "./crc32c_round_up.h" #include "crc32c/crc32c_config.h" #if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) #if defined(_MSC_VER) #include #else // !defined(_MSC_VER) #include #endif // defined(_MSC_VER) namespace crc32c { namespace { constexpr const ptrdiff_t kGroups = 3; constexpr const ptrdiff_t kBlock0Size = 16 * 1024 / kGroups / 64 * 64; constexpr const ptrdiff_t kBlock1Size = 4 * 1024 / kGroups / 8 * 8; constexpr const ptrdiff_t kBlock2Size = 1024 / kGroups / 8 * 8; const uint32_t kBlock0SkipTable[8][16] = { {0x00000000, 0xff770459, 0xfb027e43, 0x04757a1a, 0xf3e88a77, 0x0c9f8e2e, 0x08eaf434, 0xf79df06d, 0xe23d621f, 0x1d4a6646, 0x193f1c5c, 0xe6481805, 0x11d5e868, 0xeea2ec31, 0xead7962b, 0x15a09272}, {0x00000000, 0xc196b2cf, 0x86c1136f, 0x4757a1a0, 0x086e502f, 0xc9f8e2e0, 0x8eaf4340, 0x4f39f18f, 0x10dca05e, 0xd14a1291, 0x961db331, 0x578b01fe, 0x18b2f071, 0xd92442be, 0x9e73e31e, 0x5fe551d1}, {0x00000000, 0x21b940bc, 0x43728178, 0x62cbc1c4, 0x86e502f0, 0xa75c424c, 0xc5978388, 0xe42ec334, 0x08267311, 0x299f33ad, 0x4b54f269, 0x6aedb2d5, 0x8ec371e1, 0xaf7a315d, 0xcdb1f099, 0xec08b025}, {0x00000000, 0x104ce622, 0x2099cc44, 0x30d52a66, 0x41339888, 0x517f7eaa, 0x61aa54cc, 0x71e6b2ee, 0x82673110, 0x922bd732, 0xa2fefd54, 0xb2b21b76, 0xc354a998, 0xd3184fba, 0xe3cd65dc, 0xf38183fe}, {0x00000000, 0x012214d1, 0x024429a2, 0x03663d73, 0x04885344, 0x05aa4795, 0x06cc7ae6, 0x07ee6e37, 0x0910a688, 0x0832b259, 0x0b548f2a, 0x0a769bfb, 0x0d98f5cc, 0x0cbae11d, 0x0fdcdc6e, 0x0efec8bf}, {0x00000000, 0x12214d10, 0x24429a20, 0x3663d730, 0x48853440, 0x5aa47950, 0x6cc7ae60, 0x7ee6e370, 0x910a6880, 0x832b2590, 0xb548f2a0, 0xa769bfb0, 0xd98f5cc0, 0xcbae11d0, 0xfdcdc6e0, 0xefec8bf0}, {0x00000000, 0x27f8a7f1, 0x4ff14fe2, 0x6809e813, 0x9fe29fc4, 0xb81a3835, 0xd013d026, 0xf7eb77d7, 0x3a294979, 0x1dd1ee88, 0x75d8069b, 0x5220a16a, 0xa5cbd6bd, 0x8233714c, 0xea3a995f, 0xcdc23eae}, {0x00000000, 0x745292f2, 0xe8a525e4, 0x9cf7b716, 0xd4a63d39, 0xa0f4afcb, 0x3c0318dd, 0x48518a2f, 0xaca00c83, 0xd8f29e71, 0x44052967, 0x3057bb95, 0x780631ba, 0x0c54a348, 0x90a3145e, 0xe4f186ac}, }; const uint32_t kBlock1SkipTable[8][16] = { {0x00000000, 0x79113270, 0xf22264e0, 0x8b335690, 0xe1a8bf31, 0x98b98d41, 0x138adbd1, 0x6a9be9a1, 0xc6bd0893, 0xbfac3ae3, 0x349f6c73, 0x4d8e5e03, 0x2715b7a2, 0x5e0485d2, 0xd537d342, 0xac26e132}, {0x00000000, 0x889667d7, 0x14c0b95f, 0x9c56de88, 0x298172be, 0xa1171569, 0x3d41cbe1, 0xb5d7ac36, 0x5302e57c, 0xdb9482ab, 0x47c25c23, 0xcf543bf4, 0x7a8397c2, 0xf215f015, 0x6e432e9d, 0xe6d5494a}, {0x00000000, 0xa605caf8, 0x49e7e301, 0xefe229f9, 0x93cfc602, 0x35ca0cfa, 0xda282503, 0x7c2deffb, 0x2273faf5, 0x8476300d, 0x6b9419f4, 0xcd91d30c, 0xb1bc3cf7, 0x17b9f60f, 0xf85bdff6, 0x5e5e150e}, {0x00000000, 0x44e7f5ea, 0x89cfebd4, 0xcd281e3e, 0x1673a159, 0x529454b3, 0x9fbc4a8d, 0xdb5bbf67, 0x2ce742b2, 0x6800b758, 0xa528a966, 0xe1cf5c8c, 0x3a94e3eb, 0x7e731601, 0xb35b083f, 0xf7bcfdd5}, {0x00000000, 0x59ce8564, 0xb39d0ac8, 0xea538fac, 0x62d66361, 0x3b18e605, 0xd14b69a9, 0x8885eccd, 0xc5acc6c2, 0x9c6243a6, 0x7631cc0a, 0x2fff496e, 0xa77aa5a3, 0xfeb420c7, 0x14e7af6b, 0x4d292a0f}, {0x00000000, 0x8eb5fb75, 0x1887801b, 0x96327b6e, 0x310f0036, 0xbfbafb43, 0x2988802d, 0xa73d7b58, 0x621e006c, 0xecabfb19, 0x7a998077, 0xf42c7b02, 0x5311005a, 0xdda4fb2f, 0x4b968041, 0xc5237b34}, {0x00000000, 0xc43c00d8, 0x8d947741, 0x49a87799, 0x1ec49873, 0xdaf898ab, 0x9350ef32, 0x576cefea, 0x3d8930e6, 0xf9b5303e, 0xb01d47a7, 0x7421477f, 0x234da895, 0xe771a84d, 0xaed9dfd4, 0x6ae5df0c}, {0x00000000, 0x7b1261cc, 0xf624c398, 0x8d36a254, 0xe9a5f1c1, 0x92b7900d, 0x1f813259, 0x64935395, 0xd6a79573, 0xadb5f4bf, 0x208356eb, 0x5b913727, 0x3f0264b2, 0x4410057e, 0xc926a72a, 0xb234c6e6}, }; const uint32_t kBlock2SkipTable[8][16] = { {0x00000000, 0x8f158014, 0x1bc776d9, 0x94d2f6cd, 0x378eedb2, 0xb89b6da6, 0x2c499b6b, 0xa35c1b7f, 0x6f1ddb64, 0xe0085b70, 0x74daadbd, 0xfbcf2da9, 0x589336d6, 0xd786b6c2, 0x4354400f, 0xcc41c01b}, {0x00000000, 0xde3bb6c8, 0xb99b1b61, 0x67a0ada9, 0x76da4033, 0xa8e1f6fb, 0xcf415b52, 0x117aed9a, 0xedb48066, 0x338f36ae, 0x542f9b07, 0x8a142dcf, 0x9b6ec055, 0x4555769d, 0x22f5db34, 0xfcce6dfc}, {0x00000000, 0xde85763d, 0xb8e69a8b, 0x6663ecb6, 0x742143e7, 0xaaa435da, 0xccc7d96c, 0x1242af51, 0xe84287ce, 0x36c7f1f3, 0x50a41d45, 0x8e216b78, 0x9c63c429, 0x42e6b214, 0x24855ea2, 0xfa00289f}, {0x00000000, 0xd569796d, 0xaf3e842b, 0x7a57fd46, 0x5b917ea7, 0x8ef807ca, 0xf4affa8c, 0x21c683e1, 0xb722fd4e, 0x624b8423, 0x181c7965, 0xcd750008, 0xecb383e9, 0x39dafa84, 0x438d07c2, 0x96e47eaf}, {0x00000000, 0x6ba98c6d, 0xd75318da, 0xbcfa94b7, 0xab4a4745, 0xc0e3cb28, 0x7c195f9f, 0x17b0d3f2, 0x5378f87b, 0x38d17416, 0x842be0a1, 0xef826ccc, 0xf832bf3e, 0x939b3353, 0x2f61a7e4, 0x44c82b89}, {0x00000000, 0xa6f1f0f6, 0x480f971d, 0xeefe67eb, 0x901f2e3a, 0x36eedecc, 0xd810b927, 0x7ee149d1, 0x25d22a85, 0x8323da73, 0x6dddbd98, 0xcb2c4d6e, 0xb5cd04bf, 0x133cf449, 0xfdc293a2, 0x5b336354}, {0x00000000, 0x4ba4550a, 0x9748aa14, 0xdcecff1e, 0x2b7d22d9, 0x60d977d3, 0xbc3588cd, 0xf791ddc7, 0x56fa45b2, 0x1d5e10b8, 0xc1b2efa6, 0x8a16baac, 0x7d87676b, 0x36233261, 0xeacfcd7f, 0xa16b9875}, {0x00000000, 0xadf48b64, 0x5e056039, 0xf3f1eb5d, 0xbc0ac072, 0x11fe4b16, 0xe20fa04b, 0x4ffb2b2f, 0x7df9f615, 0xd00d7d71, 0x23fc962c, 0x8e081d48, 0xc1f33667, 0x6c07bd03, 0x9ff6565e, 0x3202dd3a}, }; constexpr const ptrdiff_t kPrefetchHorizon = 256; } // namespace uint32_t ExtendSse42(uint32_t crc, const uint8_t* data, size_t size) { const uint8_t* p = data; const uint8_t* e = data + size; uint32_t l = crc ^ kCRC32Xor; #define STEP1 \ do { \ l = _mm_crc32_u8(l, *p++); \ } while (0) #define STEP4(crc) \ do { \ crc = _mm_crc32_u32(crc, ReadUint32LE(p)); \ p += 4; \ } while (0) #define STEP8(crc, data) \ do { \ crc = _mm_crc32_u64(crc, ReadUint64LE(data)); \ data += 8; \ } while (0) #define STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \ do { \ STEP8(crc0, p0); \ STEP8(crc1, p1); \ STEP8(crc2, p2); \ } while (0) #define STEP8X3(crc0, crc1, crc2, bs) \ do { \ crc0 = _mm_crc32_u64(crc0, ReadUint64LE(p)); \ crc1 = _mm_crc32_u64(crc1, ReadUint64LE(p + bs)); \ crc2 = _mm_crc32_u64(crc2, ReadUint64LE(p + 2 * bs)); \ p += 8; \ } while (0) #define SKIP_BLOCK(crc, tab) \ do { \ crc = tab[0][crc & 0xf] ^ tab[1][(crc >> 4) & 0xf] ^ \ tab[2][(crc >> 8) & 0xf] ^ tab[3][(crc >> 12) & 0xf] ^ \ tab[4][(crc >> 16) & 0xf] ^ tab[5][(crc >> 20) & 0xf] ^ \ tab[6][(crc >> 24) & 0xf] ^ tab[7][(crc >> 28) & 0xf]; \ } while (0) // Point x at first 8-byte aligned byte in the buffer. This might be past the // end of the buffer. const uint8_t* x = RoundUp<8>(p); if (x <= e) { // Process bytes p is 8-byte aligned. while (p != x) { STEP1; } } // Proccess the data in predetermined block sizes with tables for quickly // combining the checksum. Experimentally it's better to use larger block // sizes where possible so use a hierarchy of decreasing block sizes. uint64_t l64 = l; while ((e - p) >= kGroups * kBlock0Size) { uint64_t l641 = 0; uint64_t l642 = 0; for (int i = 0; i < kBlock0Size; i += 8 * 8) { // Prefetch ahead to hide latency. RequestPrefetch(p + kPrefetchHorizon); RequestPrefetch(p + kBlock0Size + kPrefetchHorizon); RequestPrefetch(p + 2 * kBlock0Size + kPrefetchHorizon); // Process 64 bytes at a time. STEP8X3(l64, l641, l642, kBlock0Size); STEP8X3(l64, l641, l642, kBlock0Size); STEP8X3(l64, l641, l642, kBlock0Size); STEP8X3(l64, l641, l642, kBlock0Size); STEP8X3(l64, l641, l642, kBlock0Size); STEP8X3(l64, l641, l642, kBlock0Size); STEP8X3(l64, l641, l642, kBlock0Size); STEP8X3(l64, l641, l642, kBlock0Size); } // Combine results. SKIP_BLOCK(l64, kBlock0SkipTable); l64 ^= l641; SKIP_BLOCK(l64, kBlock0SkipTable); l64 ^= l642; p += (kGroups - 1) * kBlock0Size; } while ((e - p) >= kGroups * kBlock1Size) { uint64_t l641 = 0; uint64_t l642 = 0; for (int i = 0; i < kBlock1Size; i += 8) { STEP8X3(l64, l641, l642, kBlock1Size); } SKIP_BLOCK(l64, kBlock1SkipTable); l64 ^= l641; SKIP_BLOCK(l64, kBlock1SkipTable); l64 ^= l642; p += (kGroups - 1) * kBlock1Size; } while ((e - p) >= kGroups * kBlock2Size) { uint64_t l641 = 0; uint64_t l642 = 0; for (int i = 0; i < kBlock2Size; i += 8) { STEP8X3(l64, l641, l642, kBlock2Size); } SKIP_BLOCK(l64, kBlock2SkipTable); l64 ^= l641; SKIP_BLOCK(l64, kBlock2SkipTable); l64 ^= l642; p += (kGroups - 1) * kBlock2Size; } // Process bytes 16 at a time while ((e - p) >= 16) { STEP8(l64, p); STEP8(l64, p); } l = static_cast(l64); // Process the last few bytes. while (p != e) { STEP1; } #undef SKIP_BLOCK #undef STEP8X3 #undef STEP8BY3 #undef STEP8 #undef STEP4 #undef STEP1 return l ^ kCRC32Xor; } } // namespace crc32c #endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__))