xref: /6.6.0/platform/src/crc32c.cc (revision 87c134a8)
1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2  * Copyright (C) 2013 Mark Adler
3  * Version 1.1  1 Aug 2013  Mark Adler
4  */
5
6/*
7  This software is provided 'as-is', without any express or implied
8  warranty.  In no event will the author be held liable for any damages
9  arising from the use of this software.
10
11  Permission is granted to anyone to use this software for any purpose,
12  including commercial applications, and to alter it and redistribute it
13  freely, subject to the following restrictions:
14
15  1. The origin of this software must not be misrepresented; you must not
16     claim that you wrote the original software. If you use this software
17     in a product, an acknowledgment in the product documentation would be
18     appreciated but is not required.
19  2. Altered source versions must be plainly marked as such, and must not be
20     misrepresented as being the original software.
21  3. This notice may not be removed or altered from any source distribution.
22
23  Mark Adler
24  madler@alumni.caltech.edu
25 */
26
27//
28// Software and Hardware Assisted CRC32-C function.
29//
30// This is an altered/adapted version of Mark Adler's crc32c.c
31//  - See http://stackoverflow.com/a/17646775
32//  - See above license.
33//  - This module provides a software only crc32c
34//    and cpuid code to enable HW assist.
35//
36// Changes from orginal version include.
37//  a) Compiler intrinsics instead of inline asm.
38//  b) Some re-styling, commenting and code style safety.
39//    i) no if or loops without braces.
40//    ii) variable initialisation.
41//  c) GCC/CLANG/MSVC safe.
42//  d) C++ casting and limits.
43//  e) Benchmarked and tuned.
44//    i) The 3way optimised version is slower for data sizes < 3xSHORT_BLOCK
45//       so fall back to a SHORT_BLOCK only mode or a single issue version.
46//    ii) See crc32c_bench.cc for testing
47//  f) Validated with IETF test vectors.
48//    i) See crc32c_test.cc.
49//  g) Use of GCC4.8 attributes to select SSE4.2 vs SW version/
50//  h) Custom cpuid code works for GCC(<4.8), CLANG and MSVC.
51//  i) Use static initialistion instead of pthread_once.
52//
53
54#include "platform/crc32c.h"
55#include "crc32c_private.h"
56
57#include <stdint.h>
58#include <stddef.h>
59
60// select header file for cpuid.
61#if defined(WIN32)
62#include <intrin.h>
63#elif defined(__clang__) || defined(__GNUC__)
64#include <cpuid.h>
65#endif
66
67#include <limits>
68#include <array>
69
70typedef uint32_t (*crc32c_function) (const uint8_t* buf, size_t len, uint32_t crc_in);
71
72static bool setup_tables();
73static bool tables_setup = setup_tables();
74
75const uint32_t CRC32C_POLYNOMIAL_REV = 0x82F63B78;
76const int TABLE_X = 8, TABLE_Y = 256;
77static uint32_t crc32c_sw_lookup_table[TABLE_X][TABLE_Y];
78/* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
79uint32_t crc32c_long[SHIFT_TABLE_X][SHIFT_TABLE_Y];
80uint32_t crc32c_short[SHIFT_TABLE_X][SHIFT_TABLE_Y];
81
82/* Multiply a matrix times a vector over the Galois field of two elements,
83   GF(2).  Each element is a bit in an unsigned integer.  mat must have at
84   least as many entries as the power of two for most significant one bit in
85   vec. */
86static inline uint32_t gf2_matrix_times(const uint32_t *mat, uint32_t vec) {
87    uint32_t sum = 0;
88
89    while (vec > 0) {
90        if (vec & 1) {
91            sum ^= *mat;
92        }
93        vec >>= 1;
94        mat++;
95    }
96    return sum;
97}
98
99/* Multiply a matrix by itself over GF(2).  Both mat and square must have 32
100   rows. The result is written to 'square' */
101static inline void gf2_matrix_square(uint32_t *square, const uint32_t *mat) {
102    int n = 0;
103
104    for (n = 0; n < 32; n++) {
105        square[n] = gf2_matrix_times(mat, mat[n]);
106    }
107}
108
109/* Construct an operator to apply len zeros to a crc.  len must be a power of
110   two.  If len is not a power of two, then the result is the same as for the
111   largest power of two less than len.  The result for len == 0 is the same as
112   for len == 1.  A version of this routine could be easily written for any
113   len, but that is not needed for this application. */
114static void crc32c_zeros_op(uint32_t *even, size_t len) {
115    int n = 0;
116    uint32_t row = 1;
117    uint32_t odd[32];       /* odd-power-of-two zeros operator */
118
119    /* put operator for one zero bit in odd */
120    odd[0] = CRC32C_POLYNOMIAL_REV;              /* CRC-32C polynomial */
121
122    for (n = 1; n < 32; n++) {
123        odd[n] = row;
124        row <<= 1;
125    }
126
127    /* put operator for two zero bits in even */
128    gf2_matrix_square(even, odd);
129
130    /* put operator for four zero bits in odd */
131    gf2_matrix_square(odd, even);
132
133    /* first square will put the operator for one zero byte (eight zero bits),
134       in even -- buf square puts operator for two zero bytes in odd, and so
135       on, until len has been rotated down to zero */
136    do {
137        gf2_matrix_square(even, odd);
138        len >>= 1;
139        if (len == 0) {
140            return;
141        }
142        gf2_matrix_square(odd, even);
143        len >>= 1;
144    } while (len > 0);
145
146    /* answer ended up in odd -- copy to even */
147    for (n = 0; n < 32; n++) {
148        even[n] = odd[n];
149    }
150}
151
152/* Take a length and build four lookup tables for applying the zeros operator
153   for that length, byte-by-byte on the operand. */
154static void crc32c_zeros(uint32_t zeros[SHIFT_TABLE_X][SHIFT_TABLE_Y], size_t len) {
155    uint32_t op[32];
156
157    crc32c_zeros_op(op, len);
158    for (uint32_t n = 0; n < 256; n++) {
159        zeros[0][n] = gf2_matrix_times(op, n);
160        zeros[1][n] = gf2_matrix_times(op, n << 8);
161        zeros[2][n] = gf2_matrix_times(op, n << 16);
162        zeros[3][n] = gf2_matrix_times(op, n << 24);
163    }
164}
165
166// single CRC in software
167static inline uint64_t crc32c_sw_inner(uint64_t crc, const uint8_t* buffer) {
168    crc ^= *reinterpret_cast<const uint64_t*>(buffer);
169    crc = crc32c_sw_lookup_table[7][crc & 0xff] ^
170        crc32c_sw_lookup_table[6][(crc >> 8) & 0xff] ^
171        crc32c_sw_lookup_table[5][(crc >> 16) & 0xff] ^
172        crc32c_sw_lookup_table[4][(crc >> 24) & 0xff] ^
173        crc32c_sw_lookup_table[3][(crc >> 32) & 0xff] ^
174        crc32c_sw_lookup_table[2][(crc >> 40) & 0xff] ^
175        crc32c_sw_lookup_table[1][(crc >> 48) & 0xff] ^
176        crc32c_sw_lookup_table[0][crc >> 56];
177        return crc;
178}
179
180//
181// CRC32-C implementation using software
182// No optimisation
183//
184uint32_t crc32c_sw_1way(const uint8_t* buf, size_t len, uint32_t crc_in) {
185    uint64_t crc = static_cast<uint64_t>(~crc_in);
186
187    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {
188        crc = crc32c_sw_lookup_table[0][(crc ^ *buf) & 0xff] ^ (crc >> 8);
189        buf += sizeof(uint8_t);
190        len -= sizeof(uint8_t);
191    }
192
193    while (len >= sizeof(uint64_t)) {
194        crc  = crc32c_sw_inner(crc, buf);
195        buf += sizeof(uint64_t);
196        len -= sizeof(uint64_t);
197    }
198
199    while (len > 0) {
200        crc = crc32c_sw_lookup_table[0][(crc ^ *buf) & 0xff] ^ (crc >> 8);
201        buf += sizeof(uint8_t);
202        len -= sizeof(uint8_t);
203    }
204
205    return static_cast<uint32_t>(crc ^ std::numeric_limits<uint32_t>::max());
206}
207
208//
209// Partially optimised CRC32C which divides the data into 3 blocks
210// allowing some free CPU pipelining/parallelisation.
211//
212uint32_t crc32c_sw_short_block(const uint8_t* buf, size_t len, uint32_t crc_in) {
213    // If len is less the 3 x SHORT_BLOCK just use the 1-way sw version
214    if (len < (3 * SHORT_BLOCK)) {
215        return crc32c_sw_1way(buf, len, crc_in);
216    }
217
218    uint64_t crc = static_cast<uint64_t>(~crc_in), crc1 = 0, crc2 = 0;
219
220    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {
221        crc = crc32c_sw_lookup_table[0][(crc ^ *buf) & 0xff] ^ (crc >> 8);
222        buf += sizeof(uint8_t);
223        len -= sizeof(uint8_t);
224    }
225
226    // process the data in 3 blocks and combine the crc's using the shift trick
227    while (len >= (3 * SHORT_BLOCK)) {
228        crc1 = 0;
229        crc2 = 0;
230        const uint8_t* end = buf + SHORT_BLOCK;
231        do
232        {
233            crc  = crc32c_sw_inner(crc, buf);
234            crc1 = crc32c_sw_inner(crc1, (buf + SHORT_BLOCK));
235            crc2 = crc32c_sw_inner(crc2, (buf + (2 * SHORT_BLOCK)));
236            buf += sizeof(uint64_t);
237        } while (buf < end);
238        crc = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc)) ^ crc1;
239        crc = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc)) ^ crc2;
240        buf += 2 * SHORT_BLOCK;
241        len -= 3 * SHORT_BLOCK;
242    }
243
244    // swallow any remaining longs.
245    while (len >= sizeof(uint64_t)) {
246        crc = crc32c_sw_inner(crc, buf);
247        buf += sizeof(uint64_t);
248        len -= sizeof(uint64_t);
249    }
250
251    // swallow the remaining bytes.
252    while (len > 0) {
253        crc = crc32c_sw_lookup_table[0][(crc ^ *buf) & 0xff] ^ (crc >> 8);
254        buf += sizeof(uint8_t);
255        len -= sizeof(uint8_t);
256    }
257
258    return static_cast<uint32_t>(crc ^ std::numeric_limits<uint32_t>::max());
259}
260
261//
262// CRC32-C software implementation.
263//
264PLATFORM_PUBLIC_API
265uint32_t crc32c_sw (const uint8_t* buf, size_t len, uint32_t crc_in) {
266    // If len is less than the 3 x LONG_BLOCK it's faster to use the short-block only.
267    if (len < (3 * LONG_BLOCK)) {
268        return crc32c_sw_short_block(buf, len, crc_in);
269    }
270
271    uint64_t crc = static_cast<uint64_t>(~crc_in), crc1 = 0, crc2 = 0;
272
273    while ((reinterpret_cast<uintptr_t>(buf) & ALIGN64_MASK) != 0 && len > 0) {
274        crc = crc32c_sw_lookup_table[0][(crc ^ *buf) & 0xff] ^ (crc >> 8);
275        buf += sizeof(uint8_t);
276        len -= sizeof(uint8_t);
277    }
278
279    // process the data in 3 blocks and combine the crc's using the shift trick
280    while (len >= (3 * LONG_BLOCK)) {
281        crc1 = 0;
282        crc2 = 0;
283        const uint8_t* end = buf + LONG_BLOCK;
284        do
285        {
286            crc  = crc32c_sw_inner(crc, buf);
287            crc1 = crc32c_sw_inner(crc1, (buf + LONG_BLOCK));
288            crc2 = crc32c_sw_inner(crc2, (buf + (2 * LONG_BLOCK)));
289            buf += sizeof(uint64_t);
290        } while (buf < end);
291        crc = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc)) ^ crc1;
292        crc = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc)) ^ crc2;
293        buf += 2 * LONG_BLOCK;
294        len -= 3 * LONG_BLOCK;
295    }
296
297    // process the data in 3 blocks and combine the crc's using the shift trick
298    while (len >= (3 * SHORT_BLOCK)) {
299        crc1 = 0;
300        crc2 = 0;
301        const uint8_t* end = buf + SHORT_BLOCK;
302        do
303        {
304            crc  = crc32c_sw_inner(crc, buf);
305            crc1 = crc32c_sw_inner(crc1, (buf + SHORT_BLOCK));
306            crc2 = crc32c_sw_inner(crc2, (buf + (2 * SHORT_BLOCK)));
307            buf += sizeof(uint64_t);
308        } while (buf < end);
309        crc = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc)) ^ crc1;
310        crc = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc)) ^ crc2;
311        buf += 2 * SHORT_BLOCK;
312        len -= 3 * SHORT_BLOCK;
313    }
314
315    // swallow any remaining longs.
316    while (len >= sizeof(uint64_t)) {
317        crc = crc32c_sw_inner(crc, buf);
318        buf += sizeof(uint64_t);
319        len -= sizeof(uint64_t);
320    }
321
322    // swallow any remaining bytes.
323    while (len > 0) {
324        crc = crc32c_sw_lookup_table[0][(crc ^ *buf) & 0xff] ^ (crc >> 8);
325        buf += sizeof(uint8_t);
326        len -= sizeof(uint8_t);
327    }
328
329    return static_cast<uint32_t>(crc ^ std::numeric_limits<uint32_t>::max());
330}
331
332
333
334//
335// Initialise tables for software and hardware functions.
336//
337bool setup_tables() {
338    uint32_t crc = 0;
339    for (int ii = 0; ii < TABLE_Y; ii++) {
340        crc = ii;
341        for (int jj = 0; jj < TABLE_X; jj++) {
342            crc = crc & 1 ? (crc >> 1) ^ CRC32C_POLYNOMIAL_REV : crc >> 1;
343        }
344        crc32c_sw_lookup_table[0][ii] = crc;
345    }
346
347    for (int ii = 0; ii < TABLE_Y; ii++) {
348        crc = crc32c_sw_lookup_table[0][ii];
349        for (int jj = 1; jj < TABLE_X; jj++) {
350            crc = crc32c_sw_lookup_table[jj][ii] =
351                crc32c_sw_lookup_table[0][crc & 0xff] ^ (crc >> 8);
352        }
353    }
354
355    crc32c_zeros(crc32c_long, LONG_BLOCK);
356    crc32c_zeros(crc32c_short, SHORT_BLOCK);
357
358    (void) tables_setup;
359    return true;
360}
361
362//
363// Return the appropriate function for the platform.
364// If SSE4.2 is available then hardware acceleration is used.
365//
366crc32c_function setup_crc32c() {
367    const uint32_t SSE42 = 0x00100000;
368
369    crc32c_function f = crc32c_sw;
370
371#if defined(WIN32)
372    std::array<int, 4> registers = {{0,0,0,0}};
373    __cpuid(registers.data(), 1);
374#else
375    std::array<uint32_t, 4> registers = {{0,0,0,0}};
376    __get_cpuid(1, &registers[0], &registers[1], &registers[2],&registers[3]);
377#endif
378
379    if (registers[2] & SSE42) {
380        f = crc32c_hw;
381    }
382
383    return f;
384}
385
386static crc32c_function safe_crc32c = setup_crc32c();
387
388//
389// The exported crc32c method uses the function setup_crc32 decided
390// is safe for the platform.
391//
392PLATFORM_PUBLIC_API
393uint32_t crc32c (const uint8_t* buf, size_t len, uint32_t crc_in) {
394    return safe_crc32c(buf, len, crc_in);
395}
396