/* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
 * Copyright (C) 2013 Mark Adler
 * Version 1.1  1 Aug 2013  Mark Adler
 */

/*
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler
  madler@alumni.caltech.edu
 */

//
// Software and Hardware Assisted CRC32-C function.
//
// This is an altered/adapted version of Mark Adler's crc32c.c
//  - see http://stackoverflow.com/a/17646775
//  - see above license.
//  - This module provides the HW support and is built
//    with -msse4.2 where applicable. We only execute inside
//    this module if SSE4.2 is detected.
//
// Changes from the original version include:
//  a) Compiler intrinsics instead of inline asm.
//  b) Some re-styling, commenting and code style safety.
//    i) no if or loops without braces.
//    ii) variable initialisation.
//  c) GCC/CLANG/MSVC safe.
//  d) C++ casting and limits.
//  e) Benchmarked and tuned.
//    i) The 3-way optimised version is slower for data sizes < 3xSHORT_BLOCK,
//       so fall back to a SHORT_BLOCK-only mode or a single-issue version.
//    ii) See crc32c_bench.cc for testing.
//  f) Validated with IETF test vectors.
//    i) See crc32c_test.cc.
//  g) Use of GCC 4.8 attributes to select the SSE4.2 vs SW version.
//  h) Custom cpuid code works for GCC (<4.8), CLANG and MSVC.
//  i) Use static initialisation instead of pthread_once (see the sketch below).
//

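// The runtime selection described in (g)-(i) lives outside this translation
// unit; a minimal sketch of dispatch via static initialisation (illustrative
// only, with hypothetical helper names) might look like:
//
//     typedef uint32_t (*crc32c_fn)(const uint8_t*, size_t, uint32_t);
//     static const crc32c_fn chosen_crc32c =
//         cpu_has_sse42() ? crc32c_hw : crc32c_sw;  // probed once at start-up
//
//     uint32_t crc32c(const uint8_t* buf, size_t len, uint32_t crc_in) {
//         return chosen_crc32c(buf, len, crc_in);
//     }
//
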
#include "platform/crc32c.h"
#include "crc32c_private.h"

// select header file for crc instructions.
#if defined(WIN32)
#include <nmmintrin.h>
#elif defined(__clang__)
#include <smmintrin.h>
#elif defined(__GNUC__)
#include <smmintrin.h>
#endif

#include <limits>

#if defined(__i386) || defined(_M_IX86)
typedef uint32_t crc_max_size_t;
#define _mm_crc32_max_size _mm_crc32_u32
#else
typedef uint64_t crc_max_size_t;
#define _mm_crc32_max_size _mm_crc32_u64
#endif
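
// On 64-bit builds crc_max_size_t is 8 bytes wide and the 64-bit crc32
// instruction (_mm_crc32_u64) consumes the bulk of the data; 32-bit x86
// builds only have the 32-bit form, so 4 bytes are processed per instruction.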

//
// CRC32-C implementation using SSE4.2 acceleration
// no pipeline optimisation.
//
uint32_t crc32c_hw_1way(const uint8_t* buf, size_t len, uint32_t crc_in) {
    crc_max_size_t crc = static_cast<crc_max_size_t>(~crc_in);
    // use crc32-byte instruction until the buf pointer is 8-byte aligned
    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {
        crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    // Use crc32_max_size until there's no more u32/u64 to process.
    while (len >= sizeof(crc_max_size_t)) {
        crc = _mm_crc32_max_size(crc, *reinterpret_cast<const crc_max_size_t*>(buf));
        buf += sizeof(crc_max_size_t);
        len -= sizeof(crc_max_size_t);
    }

    // finish the rest using the byte instruction
    while (len > 0) {
        crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    return static_cast<uint32_t>(crc ^ std::numeric_limits<uint32_t>::max());
}

//
// HW assisted crc32c that processes data in parallel using 3 x SHORT_BLOCK blocks.
//
uint32_t crc32c_hw_short_block(const uint8_t* buf, size_t len, uint32_t crc_in) {
    // If len is less than 3xSHORT_BLOCK just use the 1-way hw version
    if (len < (3*SHORT_BLOCK)) {
        return crc32c_hw_1way(buf, len, crc_in);
    }

    crc_max_size_t crc0 = static_cast<crc_max_size_t>(~crc_in), crc1 = 0, crc2 = 0;

    // use crc32-byte instruction until the buf pointer is 8-byte aligned
    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {

        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    // process the data using 3 pipelined crc streams, each working on a block of SHORT_BLOCK bytes
    while (len >= (3 * SHORT_BLOCK)) {
        crc1 = 0;
        crc2 = 0;
        const uint8_t* end = buf + SHORT_BLOCK;
        do
        {
            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + SHORT_BLOCK));
            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * SHORT_BLOCK)));
            buf += sizeof(crc_max_size_t);
        } while (buf < end);
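        // Combine the three interleaved CRCs: crc32c_shift applies the effect
        // of SHORT_BLOCK zero bytes to crc0, advancing it past the data that
        // crc1 (and then crc2) covered so the streams can be XORed together.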
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc2;
        buf += 2 * SHORT_BLOCK;
        len -= 3 * SHORT_BLOCK;
    }

    // Use crc32_max_size until there's no more u32/u64 to process.
    while (len >= sizeof(crc_max_size_t)) {
        crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
        buf += sizeof(crc_max_size_t);
        len -= sizeof(crc_max_size_t);
    }

    // finish the rest using the byte instruction
    while (len > 0) {
        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    return static_cast<uint32_t>(crc0 ^ std::numeric_limits<uint32_t>::max());
}


//
// A parallelised crc32c that works on three independent streams at once.
// Generally three crc instructions can be kept in flight (the instruction has
// a three-cycle latency but one-per-cycle throughput).
//
uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc_in) {
    // if len is less than 3 x LONG_BLOCK it's faster to just use the 3-way short-block version
    if (len < 3*LONG_BLOCK) {
        return crc32c_hw_short_block(buf, len, crc_in);
    }

    crc_max_size_t crc0 = static_cast<crc_max_size_t>(~crc_in), crc1 = 0, crc2 = 0;

    // use crc32-byte instruction until the buf pointer is 8-byte aligned
    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {

        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    /* compute the crc on sets of LONG_BLOCK*3 bytes, executing three independent crc
       instructions, each on LONG_BLOCK bytes -- this is optimized for the Nehalem,
       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
       throughput of one crc per cycle, but a latency of three cycles */
    while (len >= (3 * LONG_BLOCK)) {
        crc1 = 0;
        crc2 = 0;
        const uint8_t* end = buf + LONG_BLOCK;
        do
        {
            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + LONG_BLOCK));
            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * LONG_BLOCK)));
            buf += sizeof(crc_max_size_t);
        } while (buf < end);
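        // Combine the three streams as in the short-block routine, but the
        // shift here accounts for LONG_BLOCK bytes of data per stream.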
        crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc2;
        buf += 2 * LONG_BLOCK;
        len -= 3 * LONG_BLOCK;
    }

    /* do the same thing, but now on SHORT_BLOCK*3 blocks for the remaining data less
       than a LONG_BLOCK*3 block */
    while (len >= (3 * SHORT_BLOCK)) {
        crc1 = 0;
        crc2 = 0;
        const uint8_t* end = buf + SHORT_BLOCK;
        do
        {
            crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
            crc1 = _mm_crc32_max_size(crc1, *reinterpret_cast<const crc_max_size_t*>(buf + SHORT_BLOCK));
            crc2 = _mm_crc32_max_size(crc2, *reinterpret_cast<const crc_max_size_t*>(buf + (2 * SHORT_BLOCK)));
            buf += sizeof(crc_max_size_t);
        } while (buf < end);
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc2;
        buf += 2 * SHORT_BLOCK;
        len -= 3 * SHORT_BLOCK;
    }

    // Use crc32_max_size until there's no more u32/u64 to process.
    while (len >= sizeof(crc_max_size_t)) {
        crc0 = _mm_crc32_max_size(crc0, *reinterpret_cast<const crc_max_size_t*>(buf));
        buf += sizeof(crc_max_size_t);
        len -= sizeof(crc_max_size_t);
    }

    // finish the rest using the byte instruction
    while (len > 0) {
        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *buf);
        buf += sizeof(uint8_t);
        len -= sizeof(uint8_t);
    }

    return static_cast<uint32_t>(crc0 ^ std::numeric_limits<uint32_t>::max());
}
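
//
// Usage sketch (illustrative, assuming the caller has already verified SSE4.2
// support): pass 0 as the initial CRC for a fresh checksum; the pre- and
// post-inversion of the CRC value is handled inside the functions above.
//
//     uint32_t crc = crc32c_hw(data, data_len, 0);  // data/data_len are
//                                                   // hypothetical caller names
//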