1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2 * Copyright (C) 2013 Mark Adler
3 * Version 1.1 1 Aug 2013 Mark Adler
4 */
5
6 /*
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the author be held liable for any damages
9 arising from the use of this software.
10
11 Permission is granted to anyone to use this software for any purpose,
12 including commercial applications, and to alter it and redistribute it
13 freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must not
16 claim that you wrote the original software. If you use this software
17 in a product, an acknowledgment in the product documentation would be
18 appreciated but is not required.
19 2. Altered source versions must be plainly marked as such, and must not be
20 misrepresented as being the original software.
21 3. This notice may not be removed or altered from any source distribution.
22
23 Mark Adler
24 madler@alumni.caltech.edu
25 */
26
27 //
28 // Software and Hardware Assisted CRC32-C function.
29 //
30 // This is an altered/adapted version of Mark Adler's crc32c.c
31 // - see http://stackoverflow.com/a/17646775
32 // - see above license.
33 // - This module provides the HW support and is built
34 // with -msse4.2 where applicable. We only execute inside
35 // this module if SSE4.2 is detected.
36 //
// Changes from original version include:
38 // a) Compiler intrinsics instead of inline asm.
39 // b) Some re-styling, commenting and code style safety.
40 // i) no if or loops without braces.
41 // ii) variable initialisation.
42 // c) GCC/CLANG/MSVC safe.
43 // d) C++ casting and limits.
44 // e) Benchmarked and tuned.
45 // i) The 3way optimised version is slower for data sizes < 3xSHORT_BLOCK
46 // so fall back to a SHORT_BLOCK only mode or a single issue version.
47 // ii) See crc32c_bench.cc for testing
48 // f) Validated with IETF test vectors.
49 // i) See crc32c_test.cc.
// g) Use of GCC4.8 attributes to select SSE4.2 vs SW version.
51 // h) Custom cpuid code works for GCC(<4.8), CLANG and MSVC.
// i) Use static initialisation instead of pthread_once.
53 //
54
55 #include "platform/crc32c.h"
56 #include "crc32c_private.h"
57
58 // select header file for crc instructions.
59 #if defined(WIN32)
60 #include <nmmintrin.h>
61 #elif defined(__clang__)
62 #include <smmintrin.h>
63 #elif defined(__GNUC__)
64 #include <smmintrin.h>
65 #endif
66
67 #include <limits>
68
// Select the widest crc32 instruction available on the target:
// 32-bit x86 only has _mm_crc32_u32 (4 bytes per instruction), while
// x86-64 can consume 8 bytes per instruction with _mm_crc32_u64.
// crc_max_size_t is the matching operand/accumulator width.
#if defined(__i386) || defined(_M_IX86)
typedef uint32_t crc_max_size_t;
#define _mm_crc32_max_size _mm_crc32_u32
#else
typedef uint64_t crc_max_size_t;
#define _mm_crc32_max_size _mm_crc32_u64
#endif
76
77 //
78 // CRC32-C implementation using SSE4.2 acceleration
79 // no pipeline optimisation.
80 //
crc32c_hw_1way(const uint8_t* buf, size_t len, uint32_t crc_in)81 uint32_t crc32c_hw_1way(const uint8_t* buf, size_t len, uint32_t crc_in) {
82 crc_max_size_t crc = static_cast<crc_max_size_t>(~crc_in);
83 // use crc32-byte instruction until the buf pointer is 8-byte aligned
84 while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {
85 crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
86 buf += sizeof(uint8_t);
87 len -= sizeof(uint8_t);
88 }
89
90 // Use crc32_max size until there's no more u32/u64 to process.
91 while (len >= sizeof(crc_max_size_t)) {
92 crc = _mm_crc32_max_size(crc, *reinterpret_cast<const crc_max_size_t*>(buf));
93 buf += sizeof(crc_max_size_t);
94 len -= sizeof(crc_max_size_t);
95 }
96
97 // finish the rest using the byte instruction
98 while (len > 0) {
99 crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
100 buf += sizeof(uint8_t);
101 len -= sizeof(uint8_t);
102 }
103
104 return static_cast<uint32_t>(crc ^ std::numeric_limits<uint32_t>::max());
105 }
106
//
// HW assisted crc32c that processes data in parallel using 3 x SHORT_BLOCK
// chunks, issuing three independent crc32 instructions per loop iteration.
// Returns the CRC-32C of buf[0..len-1], seeded with crc_in.
//
uint32_t crc32c_hw_short_block(const uint8_t* buf, size_t len, uint32_t crc_in) {
    // If len is less than 3 x SHORT_BLOCK just use the 1-way hw version;
    // the pipelined path below needs at least three full blocks to pay off.
    if (len < (3*SHORT_BLOCK)) {
        return crc32c_hw_1way(buf, len, crc_in);
    }

    // crc0 carries the running (inverted) CRC; crc1/crc2 accumulate the
    // second and third SHORT_BLOCKs of each 3-block group.
    crc_max_size_t crc0 = static_cast<crc_max_size_t>(~crc_in), crc1 = 0, crc2 = 0;

    // use crc32-byte instruction until the buf pointer is 8-byte aligned
    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {

        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    // process the data using 3 pipelined crc working on 3 blocks of SHORT_BLOCK
    while (len >= (3 * SHORT_BLOCK)) {
        crc1 = 0;
        crc2 = 0;
        const uint8_t* end = buf + SHORT_BLOCK;
        do
        {
            // Three independent crc32 instructions per iteration, one per
            // block, so they can issue/retire in parallel.
            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + SHORT_BLOCK));
            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * SHORT_BLOCK)));
            buf += sizeof(crc_max_size_t);
        } while (buf < end);
        // Merge the three partial CRCs into crc0. crc32c_shift (see
        // crc32c_private.h) presumably advances a CRC across SHORT_BLOCK
        // zero bytes so the partials combine as if computed sequentially
        // -- TODO confirm against the helper's definition.
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc2;
        buf += 2 * SHORT_BLOCK;  // skip the two blocks consumed via crc1/crc2
        len -= 3 * SHORT_BLOCK;
    }

    // Use crc32_max size until there's no more u32/u64 to process.
    while (len >= sizeof(crc_max_size_t)) {
        crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
        buf += sizeof(crc_max_size_t);
        len -= sizeof(crc_max_size_t);
    }

    // finish the rest using the byte instruction
    while (len > 0) {
        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    // Final inversion, truncated back to 32 bits.
    return static_cast<uint32_t>(crc0 ^ std::numeric_limits<uint32_t>::max());
}
160
161
//
// A parallelised crc32c issuing 3 crc at once.
// Generally 3 crc instructions can be issued at once.
// Processes 3 x LONG_BLOCK groups first, then 3 x SHORT_BLOCK groups,
// then falls through to word- and byte-at-a-time for the remainder.
// Returns the CRC-32C of buf[0..len-1], seeded with crc_in.
//
uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc_in) {
    // if len is less than the long block it's faster to just process using 3way short-block
    if (len < 3*LONG_BLOCK) {
        return crc32c_hw_short_block(buf, len, crc_in);
    }

    // crc0 carries the running (inverted) CRC; crc1/crc2 accumulate the
    // second and third blocks of each 3-block group.
    crc_max_size_t crc0 = static_cast<crc_max_size_t>(~crc_in), crc1 = 0, crc2 = 0;

    // use crc32-byte instruction until the buf pointer is 8-byte aligned
    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {

        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    /* compute the crc on sets of LONG_BLOCK*3 bytes, executing three independent crc
       instructions, each on LONG_BLOCK bytes -- this is optimized for the Nehalem,
       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
       throughput of one crc per cycle, but a latency of three cycles */
    while (len >= (3 * LONG_BLOCK)) {
        crc1 = 0;
        crc2 = 0;
        const uint8_t* end = buf + LONG_BLOCK;
        do
        {
            // One crc32 instruction per block per iteration; the three
            // streams are independent so they pipeline.
            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + LONG_BLOCK));
            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * LONG_BLOCK)));
            buf += sizeof(crc_max_size_t);
        } while (buf < end);
        // Merge partial CRCs; crc32c_shift (crc32c_private.h) presumably
        // advances a CRC across LONG_BLOCK zero bytes -- TODO confirm.
        crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc2;
        buf += 2 * LONG_BLOCK;  // skip the two blocks consumed via crc1/crc2
        len -= 3 * LONG_BLOCK;
    }

    /* do the same thing, but now on SHORT_BLOCK*3 blocks for the remaining data less
       than a LONG_BLOCK*3 block */
    while (len >= (3 * SHORT_BLOCK)) {
        crc1 = 0;
        crc2 = 0;
        const uint8_t* end = buf + SHORT_BLOCK;
        do
        {
            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + SHORT_BLOCK));
            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * SHORT_BLOCK)));
            buf += sizeof(crc_max_size_t);
        } while (buf < end);
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc2;
        buf += 2 * SHORT_BLOCK;
        len -= 3 * SHORT_BLOCK;
    }

    // Use crc32_max size until there's no more u32/u64 to process.
    while (len >= sizeof(crc_max_size_t)) {
        crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
        buf += sizeof(crc_max_size_t);
        len -= sizeof(crc_max_size_t);
    }

    // finish the rest using the byte instruction
    while (len > 0) {
        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    // Final inversion, truncated back to 32 bits.
    return static_cast<uint32_t>(crc0 ^ std::numeric_limits<uint32_t>::max());
}
238