Back to home page

LXR

 
 

    


File indexing completed on 2025-05-11 08:24:10

0001 /* crc32.c -- compute the CRC-32 of a data stream
0002  * Copyright (C) 1995-2022 Mark Adler
0003  * For conditions of distribution and use, see copyright notice in zlib.h
0004  *
0005  * This interleaved implementation of a CRC makes use of pipelined multiple
0006  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
0007  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
0008  */
0009 
0010 /* @(#) $Id$ */
0011 
0012 /*
0013   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
0014   protection on the static variables used to control the first-use generation
0015   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
0016   first call get_crc_table() to initialize the tables before allowing more than
0017   one thread to use crc32().
0018 
0019   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
0020   produced, so that this one source file can be compiled to an executable.
0021  */
0022 
0023 #ifdef MAKECRCH
0024 #  include <stdio.h>
0025 #  ifndef DYNAMIC_CRC_TABLE
0026 #    define DYNAMIC_CRC_TABLE
0027 #  endif /* !DYNAMIC_CRC_TABLE */
0028 #endif /* MAKECRCH */
0029 
0030 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
0031 
0032  /*
0033   A CRC of a message is computed on N braids of words in the message, where
0034   each word consists of W bytes (4 or 8). If N is 3, for example, then three
0035   running sparse CRCs are calculated respectively on each braid, at these
0036   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
0037   This is done starting at a word boundary, and continues until as many blocks
0038   of N * W bytes as are available have been processed. The results are combined
0039   into a single CRC at the end. For this code, N must be in the range 1..6 and
0040   W must be 4 or 8. The upper limit on N can be increased if desired by adding
0041   more #if blocks, extending the patterns apparent in the code. In addition,
0042   crc32.h would need to be regenerated, if the maximum N value is increased.
0043 
0044   N and W are chosen empirically by benchmarking the execution time on a given
0045   processor. The choices for N and W below were based on testing on Intel Kaby
0046   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
0047   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
0048   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
0049   They were all tested with either gcc or clang, all using the -O3 optimization
0050   level. Your mileage may vary.
0051  */
0052 
0053 /* Define N */
0054 #ifdef Z_TESTN
0055 #  define N Z_TESTN
0056 #else
0057 #  define N 5
0058 #endif
0059 #if N < 1 || N > 6
0060 #  error N must be in 1..6
0061 #endif
0062 
0063 /*
0064   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
0065   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
0066   that bytes are eight bits.
0067  */
0068 
0069 /*
0070   Define W and the associated z_word_t type. If W is not defined, then a
0071   braided calculation is not used, and the associated tables and code are not
0072   compiled.
0073  */
0074 #ifdef Z_TESTW
0075 #  if Z_TESTW-1 != -1
0076 #    define W Z_TESTW
0077 #  endif
0078 #else
0079 #  ifdef MAKECRCH
0080 #    define W 8         /* required for MAKECRCH */
0081 #  else
0082 #    if defined(__x86_64__) || defined(__aarch64__)
0083 #      define W 8
0084 #    else
0085 #      define W 4
0086 #    endif
0087 #  endif
0088 #endif
0089 #ifdef W
0090 #  if W == 8 && defined(Z_U8)
0091      typedef Z_U8 z_word_t;
0092 #  elif defined(Z_U4)
0093 #    undef W
0094 #    define W 4
0095      typedef Z_U4 z_word_t;
0096 #  else
0097 #    undef W
0098 #  endif
0099 #endif
0100 
0101 /* If available, use the ARM processor CRC32 instruction. */
0102 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
0103 #  define ARMCRC32
0104 #endif
0105 
0106 /* Local functions. */
0107 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
0108 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
0109 
0110 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
0111     local z_word_t byte_swap OF((z_word_t word));
0112 #endif
0113 
0114 #if defined(W) && !defined(ARMCRC32)
0115     local z_crc_t crc_word OF((z_word_t data));
0116     local z_word_t crc_word_big OF((z_word_t data));
0117 #endif
0118 
0119 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
0120 /*
0121   Swap the bytes in a z_word_t to convert between little and big endian. Any
0122   self-respecting compiler will optimize this to a single machine byte-swap
0123   instruction, if one is available. This assumes that word_t is either 32 bits
0124   or 64 bits.
0125  */
0126 local z_word_t byte_swap(word)
0127     z_word_t word;
0128 {
0129 #  if W == 8
0130     return
0131         (word & 0xff00000000000000) >> 56 |
0132         (word & 0xff000000000000) >> 40 |
0133         (word & 0xff0000000000) >> 24 |
0134         (word & 0xff00000000) >> 8 |
0135         (word & 0xff000000) << 8 |
0136         (word & 0xff0000) << 24 |
0137         (word & 0xff00) << 40 |
0138         (word & 0xff) << 56;
0139 #  else   /* W == 4 */
0140     return
0141         (word & 0xff000000) >> 24 |
0142         (word & 0xff0000) >> 8 |
0143         (word & 0xff00) << 8 |
0144         (word & 0xff) << 24;
0145 #  endif
0146 }
0147 #endif
0148 
0149 /* CRC polynomial. */
0150 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
0151 
0152 #ifdef DYNAMIC_CRC_TABLE
0153 
0154 local z_crc_t FAR crc_table[256];
0155 local z_crc_t FAR x2n_table[32];
0156 local void make_crc_table OF((void));
0157 #ifdef W
0158    local z_word_t FAR crc_big_table[256];
0159    local z_crc_t FAR crc_braid_table[W][256];
0160    local z_word_t FAR crc_braid_big_table[W][256];
0161    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
0162 #endif
0163 #ifdef MAKECRCH
0164    local void write_table OF((FILE *, const z_crc_t FAR *, int));
0165    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
0166    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
0167 #endif /* MAKECRCH */
0168 
0169 /*
0170   Define a once() function depending on the availability of atomics. If this is
0171   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
0172   multiple threads, and if atomics are not available, then get_crc_table() must
0173   be called to initialize the tables and must return before any threads are
0174   allowed to compute or combine CRCs.
0175  */
0176 
0177 /* Definition of once functionality. */
0178 typedef struct once_s once_t;
0179 local void once OF((once_t *, void (*)(void)));
0180 
0181 /* Check for the availability of atomics. */
0182 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
0183     !defined(__STDC_NO_ATOMICS__)
0184 
0185 #include <stdatomic.h>
0186 
0187 /* Structure for once(), which must be initialized with ONCE_INIT. */
0188 struct once_s {
0189     atomic_flag begun;
0190     atomic_int done;
0191 };
0192 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
0193 
0194 /*
0195   Run the provided init() function exactly once, even if multiple threads
0196   invoke once() at the same time. The state must be a once_t initialized with
0197   ONCE_INIT.
0198  */
0199 local void once(state, init)
0200     once_t *state;
0201     void (*init)(void);
0202 {
0203     if (!atomic_load(&state->done)) {
0204         if (atomic_flag_test_and_set(&state->begun))
0205             while (!atomic_load(&state->done))
0206                 ;
0207         else {
0208             init();
0209             atomic_store(&state->done, 1);
0210         }
0211     }
0212 }
0213 
0214 #else   /* no atomics */
0215 
0216 /* Structure for once(), which must be initialized with ONCE_INIT. */
0217 struct once_s {
0218     volatile int begun;
0219     volatile int done;
0220 };
0221 #define ONCE_INIT {0, 0}
0222 
0223 /* Test and set. Alas, not atomic, but tries to minimize the period of
0224    vulnerability. */
0225 local int test_and_set OF((int volatile *));
0226 local int test_and_set(flag)
0227     int volatile *flag;
0228 {
0229     int was;
0230 
0231     was = *flag;
0232     *flag = 1;
0233     return was;
0234 }
0235 
0236 /* Run the provided init() function once. This is not thread-safe. */
0237 local void once(state, init)
0238     once_t *state;
0239     void (*init)(void);
0240 {
0241     if (!state->done) {
0242         if (test_and_set(&state->begun))
0243             while (!state->done)
0244                 ;
0245         else {
0246             init();
0247             state->done = 1;
0248         }
0249     }
0250 }
0251 
0252 #endif
0253 
0254 /* State for once(). */
0255 local once_t made = ONCE_INIT;
0256 
0257 /*
0258   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
0259   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
0260 
0261   Polynomials over GF(2) are represented in binary, one bit per coefficient,
0262   with the lowest powers in the most significant bit. Then adding polynomials
0263   is just exclusive-or, and multiplying a polynomial by x is a right shift by
0264   one. If we call the above polynomial p, and represent a byte as the
0265   polynomial q, also with the lowest power in the most significant bit (so the
0266   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
0267   where a mod b means the remainder after dividing a by b.
0268 
0269   This calculation is done using the shift-register method of multiplying and
0270   taking the remainder. The register is initialized to zero, and for each
0271   incoming bit, x^32 is added mod p to the register if the bit is a one (where
0272   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
0273   (which is shifting right by one and adding x^32 mod p if the bit shifted out
0274   is a one). We start with the highest power (least significant bit) of q and
0275   repeat for all eight bits of q.
0276 
0277   The table is simply the CRC of all possible eight bit values. This is all the
0278   information needed to generate CRCs on data a byte at a time for all
0279   combinations of CRC register values and incoming bytes.
0280  */
0281 
0282 local void make_crc_table()
0283 {
0284     unsigned i, j, n;
0285     z_crc_t p;
0286 
0287     /* initialize the CRC of bytes tables */
0288     for (i = 0; i < 256; i++) {
0289         p = i;
0290         for (j = 0; j < 8; j++)
0291             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
0292         crc_table[i] = p;
0293 #ifdef W
0294         crc_big_table[i] = byte_swap(p);
0295 #endif
0296     }
0297 
0298     /* initialize the x^2^n mod p(x) table */
0299     p = (z_crc_t)1 << 30;         /* x^1 */
0300     x2n_table[0] = p;
0301     for (n = 1; n < 32; n++)
0302         x2n_table[n] = p = multmodp(p, p);
0303 
0304 #ifdef W
0305     /* initialize the braiding tables -- needs x2n_table[] */
0306     braid(crc_braid_table, crc_braid_big_table, N, W);
0307 #endif
0308 
0309 #ifdef MAKECRCH
0310     {
0311         /*
0312           The crc32.h header file contains tables for both 32-bit and 64-bit
0313           z_word_t's, and so requires a 64-bit type be available. In that case,
0314           z_word_t must be defined to be 64-bits. This code then also generates
0315           and writes out the tables for the case that z_word_t is 32 bits.
0316          */
0317 #if !defined(W) || W != 8
0318 #  error Need a 64-bit integer type in order to generate crc32.h.
0319 #endif
0320         FILE *out;
0321         int k, n;
0322         z_crc_t ltl[8][256];
0323         z_word_t big[8][256];
0324 
0325         out = fopen("crc32.h", "w");
0326         if (out == NULL) return;
0327 
0328         /* write out little-endian CRC table to crc32.h */
0329         fprintf(out,
0330             "/* crc32.h -- tables for rapid CRC calculation\n"
0331             " * Generated automatically by crc32.c\n */\n"
0332             "\n"
0333             "local const z_crc_t FAR crc_table[] = {\n"
0334             "    ");
0335         write_table(out, crc_table, 256);
0336         fprintf(out,
0337             "};\n");
0338 
0339         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
0340         fprintf(out,
0341             "\n"
0342             "#ifdef W\n"
0343             "\n"
0344             "#if W == 8\n"
0345             "\n"
0346             "local const z_word_t FAR crc_big_table[] = {\n"
0347             "    ");
0348         write_table64(out, crc_big_table, 256);
0349         fprintf(out,
0350             "};\n");
0351 
0352         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
0353         fprintf(out,
0354             "\n"
0355             "#else /* W == 4 */\n"
0356             "\n"
0357             "local const z_word_t FAR crc_big_table[] = {\n"
0358             "    ");
0359         write_table32hi(out, crc_big_table, 256);
0360         fprintf(out,
0361             "};\n"
0362             "\n"
0363             "#endif\n");
0364 
0365         /* write out braid tables for each value of N */
0366         for (n = 1; n <= 6; n++) {
0367             fprintf(out,
0368             "\n"
0369             "#if N == %d\n", n);
0370 
0371             /* compute braid tables for this N and 64-bit word_t */
0372             braid(ltl, big, n, 8);
0373 
0374             /* write out braid tables for 64-bit z_word_t to crc32.h */
0375             fprintf(out,
0376             "\n"
0377             "#if W == 8\n"
0378             "\n"
0379             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
0380             for (k = 0; k < 8; k++) {
0381                 fprintf(out, "   {");
0382                 write_table(out, ltl[k], 256);
0383                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
0384             }
0385             fprintf(out,
0386             "};\n"
0387             "\n"
0388             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
0389             for (k = 0; k < 8; k++) {
0390                 fprintf(out, "   {");
0391                 write_table64(out, big[k], 256);
0392                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
0393             }
0394             fprintf(out,
0395             "};\n");
0396 
0397             /* compute braid tables for this N and 32-bit word_t */
0398             braid(ltl, big, n, 4);
0399 
0400             /* write out braid tables for 32-bit z_word_t to crc32.h */
0401             fprintf(out,
0402             "\n"
0403             "#else /* W == 4 */\n"
0404             "\n"
0405             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
0406             for (k = 0; k < 4; k++) {
0407                 fprintf(out, "   {");
0408                 write_table(out, ltl[k], 256);
0409                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
0410             }
0411             fprintf(out,
0412             "};\n"
0413             "\n"
0414             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
0415             for (k = 0; k < 4; k++) {
0416                 fprintf(out, "   {");
0417                 write_table32hi(out, big[k], 256);
0418                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
0419             }
0420             fprintf(out,
0421             "};\n"
0422             "\n"
0423             "#endif\n"
0424             "\n"
0425             "#endif\n");
0426         }
0427         fprintf(out,
0428             "\n"
0429             "#endif\n");
0430 
0431         /* write out zeros operator table to crc32.h */
0432         fprintf(out,
0433             "\n"
0434             "local const z_crc_t FAR x2n_table[] = {\n"
0435             "    ");
0436         write_table(out, x2n_table, 32);
0437         fprintf(out,
0438             "};\n");
0439         fclose(out);
0440     }
0441 #endif /* MAKECRCH */
0442 }
0443 
0444 #ifdef MAKECRCH
0445 
0446 /*
0447    Write the 32-bit values in table[0..k-1] to out, five per line in
0448    hexadecimal separated by commas.
0449  */
0450 local void write_table(out, table, k)
0451     FILE *out;
0452     const z_crc_t FAR *table;
0453     int k;
0454 {
0455     int n;
0456 
0457     for (n = 0; n < k; n++)
0458         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
0459                 (unsigned long)(table[n]),
0460                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
0461 }
0462 
0463 /*
0464    Write the high 32-bits of each value in table[0..k-1] to out, five per line
0465    in hexadecimal separated by commas.
0466  */
0467 local void write_table32hi(out, table, k)
0468 FILE *out;
0469 const z_word_t FAR *table;
0470 int k;
0471 {
0472     int n;
0473 
0474     for (n = 0; n < k; n++)
0475         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
0476                 (unsigned long)(table[n] >> 32),
0477                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
0478 }
0479 
0480 /*
0481   Write the 64-bit values in table[0..k-1] to out, three per line in
0482   hexadecimal separated by commas. This assumes that if there is a 64-bit
0483   type, then there is also a long long integer type, and it is at least 64
0484   bits. If not, then the type cast and format string can be adjusted
0485   accordingly.
0486  */
0487 local void write_table64(out, table, k)
0488     FILE *out;
0489     const z_word_t FAR *table;
0490     int k;
0491 {
0492     int n;
0493 
0494     for (n = 0; n < k; n++)
0495         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
0496                 (unsigned long long)(table[n]),
0497                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
0498 }
0499 
0500 /* Actually do the deed. */
0501 int main()
0502 {
0503     make_crc_table();
0504     return 0;
0505 }
0506 
0507 #endif /* MAKECRCH */
0508 
0509 #ifdef W
0510 /*
0511   Generate the little and big-endian braid tables for the given n and z_word_t
0512   size w. Each array must have room for w blocks of 256 elements.
0513  */
0514 local void braid(ltl, big, n, w)
0515     z_crc_t ltl[][256];
0516     z_word_t big[][256];
0517     int n;
0518     int w;
0519 {
0520     int k;
0521     z_crc_t i, p, q;
0522     for (k = 0; k < w; k++) {
0523         p = x2nmodp((n * w + 3 - k) << 3, 0);
0524         ltl[k][0] = 0;
0525         big[w - 1 - k][0] = 0;
0526         for (i = 1; i < 256; i++) {
0527             ltl[k][i] = q = multmodp(i << 24, p);
0528             big[w - 1 - k][i] = byte_swap(q);
0529         }
0530     }
0531 }
0532 #endif
0533 
0534 #else /* !DYNAMIC_CRC_TABLE */
0535 /* ========================================================================
0536  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
0537  * of x for combining CRC-32s, all made by make_crc_table().
0538  */
0539 #include "crc32.h"
0540 #endif /* DYNAMIC_CRC_TABLE */
0541 
0542 /* ========================================================================
0543  * Routines used for CRC calculation. Some are also required for the table
0544  * generation above.
0545  */
0546 
0547 /*
0548   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
0549   reflected. For speed, this requires that a not be zero.
0550  */
0551 local z_crc_t multmodp(a, b)
0552     z_crc_t a;
0553     z_crc_t b;
0554 {
0555     z_crc_t m, p;
0556 
0557     m = (z_crc_t)1 << 31;
0558     p = 0;
0559     for (;;) {
0560         if (a & m) {
0561             p ^= b;
0562             if ((a & (m - 1)) == 0)
0563                 break;
0564         }
0565         m >>= 1;
0566         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
0567     }
0568     return p;
0569 }
0570 
0571 /*
0572   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
0573   initialized.
0574  */
0575 local z_crc_t x2nmodp(n, k)
0576     z_off64_t n;
0577     unsigned k;
0578 {
0579     z_crc_t p;
0580 
0581     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
0582     while (n) {
0583         if (n & 1)
0584             p = multmodp(x2n_table[k & 31], p);
0585         n >>= 1;
0586         k++;
0587     }
0588     return p;
0589 }
0590 
0591 /* =========================================================================
0592  * This function can be used by asm versions of crc32(), and to force the
0593  * generation of the CRC tables in a threaded application.
0594  */
0595 const z_crc_t FAR * ZEXPORT get_crc_table()
0596 {
0597 #ifdef DYNAMIC_CRC_TABLE
0598     once(&made, make_crc_table);
0599 #endif /* DYNAMIC_CRC_TABLE */
0600     return (const z_crc_t FAR *)crc_table;
0601 }
0602 
0603 /* =========================================================================
0604  * Use ARM machine instructions if available. This will compute the CRC about
0605  * ten times faster than the braided calculation. This code does not check for
0606  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
0607  * only be defined if the compilation specifies an ARM processor architecture
0608  * that has the instructions. For example, compiling with -march=armv8.1-a or
0609  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
0610  * instructions.
0611  */
0612 #ifdef ARMCRC32
0613 
0614 /*
0615    Constants empirically determined to maximize speed. These values are from
0616    measurements on a Cortex-A57. Your mileage may vary.
0617  */
0618 #define Z_BATCH 3990                /* number of words in a batch */
0619 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
0620 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
0621 
0622 unsigned long ZEXPORT crc32_z(crc, buf, len)
0623     unsigned long crc;
0624     const unsigned char FAR *buf;
0625     z_size_t len;
0626 {
0627     z_crc_t val;
0628     z_word_t crc1, crc2;
0629     const z_word_t *word;
0630     z_word_t val0, val1, val2;
0631     z_size_t last, last2, i;
0632     z_size_t num;
0633 
0634     /* Return initial CRC, if requested. */
0635     if (buf == Z_NULL) return 0;
0636 
0637 #ifdef DYNAMIC_CRC_TABLE
0638     once(&made, make_crc_table);
0639 #endif /* DYNAMIC_CRC_TABLE */
0640 
0641     /* Pre-condition the CRC */
0642     crc = (~crc) & 0xffffffff;
0643 
0644     /* Compute the CRC up to a word boundary. */
0645     while (len && ((z_size_t)buf & 7) != 0) {
0646         len--;
0647         val = *buf++;
0648         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
0649     }
0650 
0651     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
0652     word = (z_word_t const *)buf;
0653     num = len >> 3;
0654     len &= 7;
0655 
0656     /* Do three interleaved CRCs to realize the throughput of one crc32x
0657        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
0658        three CRCs are combined into a single CRC after each set of batches. */
0659     while (num >= 3 * Z_BATCH) {
0660         crc1 = 0;
0661         crc2 = 0;
0662         for (i = 0; i < Z_BATCH; i++) {
0663             val0 = word[i];
0664             val1 = word[i + Z_BATCH];
0665             val2 = word[i + 2 * Z_BATCH];
0666             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
0667             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
0668             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
0669         }
0670         word += 3 * Z_BATCH;
0671         num -= 3 * Z_BATCH;
0672         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
0673         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
0674     }
0675 
0676     /* Do one last smaller batch with the remaining words, if there are enough
0677        to pay for the combination of CRCs. */
0678     last = num / 3;
0679     if (last >= Z_BATCH_MIN) {
0680         last2 = last << 1;
0681         crc1 = 0;
0682         crc2 = 0;
0683         for (i = 0; i < last; i++) {
0684             val0 = word[i];
0685             val1 = word[i + last];
0686             val2 = word[i + last2];
0687             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
0688             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
0689             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
0690         }
0691         word += 3 * last;
0692         num -= 3 * last;
0693         val = x2nmodp(last, 6);
0694         crc = multmodp(val, crc) ^ crc1;
0695         crc = multmodp(val, crc) ^ crc2;
0696     }
0697 
0698     /* Compute the CRC on any remaining words. */
0699     for (i = 0; i < num; i++) {
0700         val0 = word[i];
0701         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
0702     }
0703     word += num;
0704 
0705     /* Complete the CRC on any remaining bytes. */
0706     buf = (const unsigned char FAR *)word;
0707     while (len) {
0708         len--;
0709         val = *buf++;
0710         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
0711     }
0712 
0713     /* Return the CRC, post-conditioned. */
0714     return crc ^ 0xffffffff;
0715 }
0716 
0717 #else
0718 
0719 #ifdef W
0720 
0721 /*
0722   Return the CRC of the W bytes in the word_t data, taking the
0723   least-significant byte of the word as the first byte of data, without any pre
0724   or post conditioning. This is used to combine the CRCs of each braid.
0725  */
0726 local z_crc_t crc_word(data)
0727     z_word_t data;
0728 {
0729     int k;
0730     for (k = 0; k < W; k++)
0731         data = (data >> 8) ^ crc_table[data & 0xff];
0732     return (z_crc_t)data;
0733 }
0734 
0735 local z_word_t crc_word_big(data)
0736     z_word_t data;
0737 {
0738     int k;
0739     for (k = 0; k < W; k++)
0740         data = (data << 8) ^
0741             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
0742     return data;
0743 }
0744 
0745 #endif
0746 
0747 /* ========================================================================= */
0748 unsigned long ZEXPORT crc32_z(crc, buf, len)
0749     unsigned long crc;
0750     const unsigned char FAR *buf;
0751     z_size_t len;
0752 {
0753     /* Return initial CRC, if requested. */
0754     if (buf == Z_NULL) return 0;
0755 
0756 #ifdef DYNAMIC_CRC_TABLE
0757     once(&made, make_crc_table);
0758 #endif /* DYNAMIC_CRC_TABLE */
0759 
0760     /* Pre-condition the CRC */
0761     crc = (~crc) & 0xffffffff;
0762 
0763 #ifdef W
0764 
0765     /* If provided enough bytes, do a braided CRC calculation. */
0766     if (len >= N * W + W - 1) {
0767         z_size_t blks;
0768         z_word_t const *words;
0769         unsigned endian;
0770         int k;
0771 
0772         /* Compute the CRC up to a z_word_t boundary. */
0773         while (len && ((z_size_t)buf & (W - 1)) != 0) {
0774             len--;
0775             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
0776         }
0777 
0778         /* Compute the CRC on as many N z_word_t blocks as are available. */
0779         blks = len / (N * W);
0780         len -= blks * N * W;
0781         words = (z_word_t const *)buf;
0782 
0783         /* Do endian check at execution time instead of compile time, since ARM
0784            processors can change the endianess at execution time. If the
0785            compiler knows what the endianess will be, it can optimize out the
0786            check and the unused branch. */
0787         endian = 1;
0788         if (*(unsigned char *)&endian) {
0789             /* Little endian. */
0790 
0791             z_crc_t crc0;
0792             z_word_t word0;
0793 #if N > 1
0794             z_crc_t crc1;
0795             z_word_t word1;
0796 #if N > 2
0797             z_crc_t crc2;
0798             z_word_t word2;
0799 #if N > 3
0800             z_crc_t crc3;
0801             z_word_t word3;
0802 #if N > 4
0803             z_crc_t crc4;
0804             z_word_t word4;
0805 #if N > 5
0806             z_crc_t crc5;
0807             z_word_t word5;
0808 #endif
0809 #endif
0810 #endif
0811 #endif
0812 #endif
0813 
0814             /* Initialize the CRC for each braid. */
0815             crc0 = crc;
0816 #if N > 1
0817             crc1 = 0;
0818 #if N > 2
0819             crc2 = 0;
0820 #if N > 3
0821             crc3 = 0;
0822 #if N > 4
0823             crc4 = 0;
0824 #if N > 5
0825             crc5 = 0;
0826 #endif
0827 #endif
0828 #endif
0829 #endif
0830 #endif
0831 
0832             /*
0833               Process the first blks-1 blocks, computing the CRCs on each braid
0834               independently.
0835              */
0836             while (--blks) {
0837                 /* Load the word for each braid into registers. */
0838                 word0 = crc0 ^ words[0];
0839 #if N > 1
0840                 word1 = crc1 ^ words[1];
0841 #if N > 2
0842                 word2 = crc2 ^ words[2];
0843 #if N > 3
0844                 word3 = crc3 ^ words[3];
0845 #if N > 4
0846                 word4 = crc4 ^ words[4];
0847 #if N > 5
0848                 word5 = crc5 ^ words[5];
0849 #endif
0850 #endif
0851 #endif
0852 #endif
0853 #endif
0854                 words += N;
0855 
0856                 /* Compute and update the CRC for each word. The loop should
0857                    get unrolled. */
0858                 crc0 = crc_braid_table[0][word0 & 0xff];
0859 #if N > 1
0860                 crc1 = crc_braid_table[0][word1 & 0xff];
0861 #if N > 2
0862                 crc2 = crc_braid_table[0][word2 & 0xff];
0863 #if N > 3
0864                 crc3 = crc_braid_table[0][word3 & 0xff];
0865 #if N > 4
0866                 crc4 = crc_braid_table[0][word4 & 0xff];
0867 #if N > 5
0868                 crc5 = crc_braid_table[0][word5 & 0xff];
0869 #endif
0870 #endif
0871 #endif
0872 #endif
0873 #endif
0874                 for (k = 1; k < W; k++) {
0875                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
0876 #if N > 1
0877                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
0878 #if N > 2
0879                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
0880 #if N > 3
0881                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
0882 #if N > 4
0883                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
0884 #if N > 5
0885                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
0886 #endif
0887 #endif
0888 #endif
0889 #endif
0890 #endif
0891                 }
0892             }
0893 
0894             /*
0895               Process the last block, combining the CRCs of the N braids at the
0896               same time.
0897              */
0898             crc = crc_word(crc0 ^ words[0]);
0899 #if N > 1
0900             crc = crc_word(crc1 ^ words[1] ^ crc);
0901 #if N > 2
0902             crc = crc_word(crc2 ^ words[2] ^ crc);
0903 #if N > 3
0904             crc = crc_word(crc3 ^ words[3] ^ crc);
0905 #if N > 4
0906             crc = crc_word(crc4 ^ words[4] ^ crc);
0907 #if N > 5
0908             crc = crc_word(crc5 ^ words[5] ^ crc);
0909 #endif
0910 #endif
0911 #endif
0912 #endif
0913 #endif
0914             words += N;
0915         }
0916         else {
0917             /* Big endian. */
0918 
0919             z_word_t crc0, word0, comb;
0920 #if N > 1
0921             z_word_t crc1, word1;
0922 #if N > 2
0923             z_word_t crc2, word2;
0924 #if N > 3
0925             z_word_t crc3, word3;
0926 #if N > 4
0927             z_word_t crc4, word4;
0928 #if N > 5
0929             z_word_t crc5, word5;
0930 #endif
0931 #endif
0932 #endif
0933 #endif
0934 #endif
0935 
0936             /* Initialize the CRC for each braid. */
0937             crc0 = byte_swap(crc);
0938 #if N > 1
0939             crc1 = 0;
0940 #if N > 2
0941             crc2 = 0;
0942 #if N > 3
0943             crc3 = 0;
0944 #if N > 4
0945             crc4 = 0;
0946 #if N > 5
0947             crc5 = 0;
0948 #endif
0949 #endif
0950 #endif
0951 #endif
0952 #endif
0953 
0954             /*
0955               Process the first blks-1 blocks, computing the CRCs on each braid
0956               independently.
0957              */
0958             while (--blks) {
0959                 /* Load the word for each braid into registers. */
0960                 word0 = crc0 ^ words[0];
0961 #if N > 1
0962                 word1 = crc1 ^ words[1];
0963 #if N > 2
0964                 word2 = crc2 ^ words[2];
0965 #if N > 3
0966                 word3 = crc3 ^ words[3];
0967 #if N > 4
0968                 word4 = crc4 ^ words[4];
0969 #if N > 5
0970                 word5 = crc5 ^ words[5];
0971 #endif
0972 #endif
0973 #endif
0974 #endif
0975 #endif
0976                 words += N;
0977 
0978                 /* Compute and update the CRC for each word. The loop should
0979                    get unrolled. */
0980                 crc0 = crc_braid_big_table[0][word0 & 0xff];
0981 #if N > 1
0982                 crc1 = crc_braid_big_table[0][word1 & 0xff];
0983 #if N > 2
0984                 crc2 = crc_braid_big_table[0][word2 & 0xff];
0985 #if N > 3
0986                 crc3 = crc_braid_big_table[0][word3 & 0xff];
0987 #if N > 4
0988                 crc4 = crc_braid_big_table[0][word4 & 0xff];
0989 #if N > 5
0990                 crc5 = crc_braid_big_table[0][word5 & 0xff];
0991 #endif
0992 #endif
0993 #endif
0994 #endif
0995 #endif
0996                 for (k = 1; k < W; k++) {
0997                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
0998 #if N > 1
0999                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
1000 #if N > 2
1001                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1002 #if N > 3
1003                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1004 #if N > 4
1005                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1006 #if N > 5
1007                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1008 #endif
1009 #endif
1010 #endif
1011 #endif
1012 #endif
1013                 }
1014             }
1015 
1016             /*
1017               Process the last block, combining the CRCs of the N braids at the
1018               same time.
1019              */
1020             comb = crc_word_big(crc0 ^ words[0]);
1021 #if N > 1
1022             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1023 #if N > 2
1024             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1025 #if N > 3
1026             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1027 #if N > 4
1028             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1029 #if N > 5
1030             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1031 #endif
1032 #endif
1033 #endif
1034 #endif
1035 #endif
1036             words += N;
1037             crc = byte_swap(comb);
1038         }
1039 
1040         /*
1041           Update the pointer to the remaining bytes to process.
1042          */
1043         buf = (unsigned char const *)words;
1044     }
1045 
1046 #endif /* W */
1047 
1048     /* Complete the computation of the CRC on any remaining bytes. */
1049     while (len >= 8) {
1050         len -= 8;
1051         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1052         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1055         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1056         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1057         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1058         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1059     }
1060     while (len) {
1061         len--;
1062         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1063     }
1064 
1065     /* Return the CRC, post-conditioned. */
1066     return crc ^ 0xffffffff;
1067 }
1068 
1069 #endif
1070 
1071 /* ========================================================================= */
1072 unsigned long ZEXPORT crc32(crc, buf, len)
1073     unsigned long crc;
1074     const unsigned char FAR *buf;
1075     uInt len;
1076 {
1077     return crc32_z(crc, buf, len);
1078 }
1079 
1080 /* ========================================================================= */
1081 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1082     uLong crc1;
1083     uLong crc2;
1084     z_off64_t len2;
1085 {
1086 #ifdef DYNAMIC_CRC_TABLE
1087     once(&made, make_crc_table);
1088 #endif /* DYNAMIC_CRC_TABLE */
1089     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1090 }
1091 
1092 /* ========================================================================= */
1093 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1094     uLong crc1;
1095     uLong crc2;
1096     z_off_t len2;
1097 {
1098     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1099 }
1100 
1101 /* ========================================================================= */
1102 uLong ZEXPORT crc32_combine_gen64(len2)
1103     z_off64_t len2;
1104 {
1105 #ifdef DYNAMIC_CRC_TABLE
1106     once(&made, make_crc_table);
1107 #endif /* DYNAMIC_CRC_TABLE */
1108     return x2nmodp(len2, 3);
1109 }
1110 
1111 /* ========================================================================= */
1112 uLong ZEXPORT crc32_combine_gen(len2)
1113     z_off_t len2;
1114 {
1115     return crc32_combine_gen64((z_off64_t)len2);
1116 }
1117 
1118 /* ========================================================================= */
1119 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
1120     uLong crc1;
1121     uLong crc2;
1122     uLong op;
1123 {
1124     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1125 }