xref: /6.6.0/platform/src/crc32c_sse4_2.cc (revision 079961ee)
1/* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2  * Copyright (C) 2013 Mark Adler
3  * Version 1.1  1 Aug 2013  Mark Adler
4  */
5
6/*
7  This software is provided 'as-is', without any express or implied
8  warranty.  In no event will the author be held liable for any damages
9  arising from the use of this software.
10
11  Permission is granted to anyone to use this software for any purpose,
12  including commercial applications, and to alter it and redistribute it
13  freely, subject to the following restrictions:
14
15  1. The origin of this software must not be misrepresented; you must not
16     claim that you wrote the original software. If you use this software
17     in a product, an acknowledgment in the product documentation would be
18     appreciated but is not required.
19  2. Altered source versions must be plainly marked as such, and must not be
20     misrepresented as being the original software.
21  3. This notice may not be removed or altered from any source distribution.
22
23  Mark Adler
24  madler@alumni.caltech.edu
25 */
26
27//
28// Software and Hardware Assisted CRC32-C function.
29//
30// This is an altered/adapted version of Mark Adler's crc32c.c
31//  - see http://stackoverflow.com/a/17646775
32//  - see above license.
33//  - This module provides the HW support and is built
34//    with -msse4.2 where applicable. We only execute inside
35//    this module if SSE4.2 is detected.
36//
// Changes from original version include:
38//  a) Compiler intrinsics instead of inline asm.
39//  b) Some re-styling, commenting and code style safety.
40//    i) no if or loops without braces.
41//    ii) variable initialisation.
42//  c) GCC/CLANG/MSVC safe.
43//  d) C++ casting and limits.
44//  e) Benchmarked and tuned.
45//    i) The 3way optimised version is slower for data sizes < 3xSHORT_BLOCK
46//       so fall back to a SHORT_BLOCK only mode or a single issue version.
47//    ii) See crc32c_bench.cc for testing
48//  f) Validated with IETF test vectors.
49//    i) See crc32c_test.cc.
//  g) Use of GCC 4.8 attributes to select SSE4.2 vs SW version.
51//  h) Custom cpuid code works for GCC(<4.8), CLANG and MSVC.
//  i) Use static initialisation instead of pthread_once.
53//
54#if !defined(__x86_64__) && !defined(_M_X64) && !defined(_M_IX86)
55#error "crc32c requires X86 SSE4.2 for hardware acceleration"
56#endif
57
58#include "crc32c_private.h"
59#include <platform/crc32c.h>
60
61// select header file for crc instructions.
62#if defined(WIN32)
63#include <nmmintrin.h>
64#elif defined(__clang__) || defined(__GNUC__)
65#include <smmintrin.h>
66#endif
67
68#include <limits>
69
70#if defined(__i386) || defined(_M_IX86)
71typedef uint32_t crc_max_size_t;
72#define _mm_crc32_max_size _mm_crc32_u32
73#else
74typedef uint64_t crc_max_size_t;
75#define _mm_crc32_max_size _mm_crc32_u64
76#endif
77
78//
79// CRC32-C implementation using SSE4.2 acceleration
80// no pipeline optimisation.
81//
82PLATFORM_PUBLIC_API
83uint32_t crc32c_hw_1way(const uint8_t* buf, size_t len, uint32_t crc_in) {
84    crc_max_size_t crc = static_cast<crc_max_size_t>(~crc_in);
85    // use crc32-byte instruction until the buf pointer is 8-byte aligned
86    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {
87        crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
88        buf += sizeof(uint8_t);
89        len -= sizeof(uint8_t);
90    }
91
92    // Use crc32_max size until there's no more u32/u64 to process.
93    while (len >= sizeof(crc_max_size_t)) {
94        crc = _mm_crc32_max_size(crc, *reinterpret_cast<const crc_max_size_t*>(buf));
95        buf += sizeof(crc_max_size_t);
96        len -= sizeof(crc_max_size_t);
97    }
98
99    // finish the rest using the byte instruction
100    while (len > 0) {
101        crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
102        buf += sizeof(uint8_t);
103        len -= sizeof(uint8_t);
104    }
105
106    return static_cast<uint32_t>(crc ^ std::numeric_limits<uint32_t>::max());
107}
108
109//
110// HW assisted crc32c that processes as much data in parallel using 3xSHORT_BLOCKs
111//
112uint32_t crc32c_hw_short_block(const uint8_t* buf, size_t len, uint32_t crc_in) {
113    // If len is less the 3xSHORT_BLOCK just use the 1-way hw version
114    if (len < (3*SHORT_BLOCK)) {
115        return crc32c_hw_1way(buf, len, crc_in);
116    }
117
118    crc_max_size_t crc0 = static_cast<crc_max_size_t>(~crc_in), crc1 = 0, crc2 = 0;
119
120    // use crc32-byte instruction until the buf pointer is 8-byte aligned
121    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {
122
123        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
124        buf += sizeof(uint8_t);
125        len -= sizeof(uint8_t);
126    }
127
128    // process the data using 3 pipelined crc working on 3 blocks of SHORT_BLOCK
129    while (len >= (3 * SHORT_BLOCK)) {
130        crc1 = 0;
131        crc2 = 0;
132        const uint8_t* end = buf + SHORT_BLOCK;
133        do
134        {
135            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
136            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + SHORT_BLOCK));
137            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * SHORT_BLOCK)));
138            buf += sizeof(crc_max_size_t);
139        } while (buf < end);
140        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
141        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc2;
142        buf += 2 * SHORT_BLOCK;
143        len -= 3 * SHORT_BLOCK;
144    }
145
146    // Use crc32_max size until there's no more u32/u64 to process.
147    while (len >= sizeof(crc_max_size_t)) {
148        crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
149        buf += sizeof(crc_max_size_t);
150        len -= sizeof(crc_max_size_t);
151    }
152
153    // finish the rest using the byte instruction
154    while (len > 0) {
155        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
156        buf += sizeof(uint8_t);
157        len -= sizeof(uint8_t);
158    }
159
160    return static_cast<uint32_t>(crc0 ^ std::numeric_limits<uint32_t>::max());
161}
162

//
// A parallelised crc32c issuing 3 crc at once.
// Generally 3 crc instructions can be issued at once.
//
// Computes CRC32-C over buf[0..len) using the SSE4.2 crc32 instruction,
// pipelining three independent CRC streams over large (LONG_BLOCK) chunks,
// then medium (SHORT_BLOCK) chunks, to hide the instruction's latency.
//
// buf    - data to checksum (any alignment; a byte-wise prologue aligns it).
// len    - number of bytes to process.
// crc_in - CRC of any preceding data (0 for a fresh computation).
// Returns the updated CRC32-C value.
//
PLATFORM_PUBLIC_API
uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc_in) {
    // if len is less than the long block it's faster to just process using 3way short-block
    if (len < 3*LONG_BLOCK) {
        return crc32c_hw_short_block(buf, len, crc_in);
    }

    // crc0 carries the real (inverted) running CRC; crc1/crc2 are the two
    // auxiliary streams used by the pipelined loops below.
    crc_max_size_t crc0 = static_cast<crc_max_size_t>(~crc_in), crc1 = 0, crc2 = 0;

    // use crc32-byte instruction until the buf pointer is 8-byte aligned
    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {

        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    /* compute the crc on sets of LONG_BLOCK*3 bytes, executing three independent crc
       instructions, each on LONG_BLOCK bytes -- this is optimized for the Nehalem,
       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
       throughput of one crc per cycle, but a latency of three cycles */
    while (len >= (3 * LONG_BLOCK)) {
        crc1 = 0;
        crc2 = 0;
        const uint8_t* end = buf + LONG_BLOCK;
        do
        {
            // One word from each of three adjacent LONG_BLOCKs per iteration.
            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + LONG_BLOCK));
            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * LONG_BLOCK)));
            buf += sizeof(crc_max_size_t);
        } while (buf < end);
        // Fold the three streams into one: crc32c_shift (see crc32c_private.h)
        // advances a CRC across a LONG_BLOCK so it can be xor-combined with the
        // next stream — the combining technique from Adler's original crc32c.c.
        crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc2;
        buf += 2 * LONG_BLOCK; // streams 1 and 2 already consumed these bytes
        len -= 3 * LONG_BLOCK;
    }

    /* do the same thing, but now on SHORT_BLOCK*3 blocks for the remaining data less
       than a LONG_BLOCK*3 block */
    while (len >= (3 * SHORT_BLOCK)) {
        crc1 = 0;
        crc2 = 0;
        const uint8_t* end = buf + SHORT_BLOCK;
        do
        {
            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + SHORT_BLOCK));
            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * SHORT_BLOCK)));
            buf += sizeof(crc_max_size_t);
        } while (buf < end);
        // Same stream-combining step as above, with the SHORT_BLOCK shift table.
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc2;
        buf += 2 * SHORT_BLOCK;
        len -= 3 * SHORT_BLOCK;
    }

    // Use crc32_max size until there's no more u32/u64 to process.
    while (len >= sizeof(crc_max_size_t)) {
        crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
        buf += sizeof(crc_max_size_t);
        len -= sizeof(crc_max_size_t);
    }

    // finish the rest using the byte instruction
    while (len > 0) {
        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    // Final inversion (xor with all-ones) back to the externally-visible form.
    return static_cast<uint32_t>(crc0 ^ std::numeric_limits<uint32_t>::max());
}
241