src/rfs/rtems-rfs-dir-hash.c

0001 /* SPDX-License-Identifier: BSD-2-Clause */
0002
0003 /**
0004  * @file
0005  *
0006  * @ingroup rtems_rfs
0007  *
0008  * @brief RTEMS File Systems Directory Hash function
0009  */
0010
0011 /*
0012  * Copyright (C) 2010 Chris Johns <chrisj@rtems.org>
0013  *
0014  * Redistribution and use in source and binary forms, with or without
0015  * modification, are permitted provided that the following conditions
0016  * are met:
0017  * 1. Redistributions of source code must retain the above copyright
0018  *    notice, this list of conditions and the following disclaimer.
0019  * 2. Redistributions in binary form must reproduce the above copyright
0020  *    notice, this list of conditions and the following disclaimer in the
0021  *    documentation and/or other materials provided with the distribution.
0022  *
0023  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
0024  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0025  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
0026  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
0027  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
0028  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
0029  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
0030  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
0031  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
0032  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
0033  * POSSIBILITY OF SUCH DAMAGE.
0034  */
0035
0036 #ifdef HAVE_CONFIG_H
0037 #include "config.h"
0038 #endif
0039
0040 #include <rtems/rfs/rtems-rfs-dir-hash.h>
0041
0042 #ifdef __rtems__
0043 # include <machine/endian.h>    /* attempt to define endianness */
0044 #endif
0045 #ifdef linux
0046 # include <endian.h>    /* attempt to define endianness */
0047 #endif
0048
0049 /*
0050  * My best guess at if you are big-endian or little-endian.  This may
0051  * need adjustment.
0052  */
0053 #if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
0054      __BYTE_ORDER == __LITTLE_ENDIAN) || \
0055   (defined(i386) || defined(__i386__) || defined(__i486__) || \
0056    defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL))
0057 # define HASH_LITTLE_ENDIAN 1
0058 # define HASH_BIG_ENDIAN 0
0059 #elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
0060        __BYTE_ORDER == __BIG_ENDIAN) || \
0061   (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))
0062 # define HASH_LITTLE_ENDIAN 0
0063 # define HASH_BIG_ENDIAN 1
0064 #else
0065 # define HASH_LITTLE_ENDIAN 0
0066 # define HASH_BIG_ENDIAN 0
0067 #endif
0068
0069 #define hashsize(n) ((uint32_t)1<<(n))
0070 #define hashmask(n) (hashsize(n)-1)
0071 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
0072
0073 /*
0074   -------------------------------------------------------------------------------
0075   mix -- mix 3 32-bit values reversibly.
0076
0077   This is reversible, so any information in (a,b,c) before mix() is still in
0078   (a,b,c) after mix().
0079
0080   If four pairs of (a,b,c) inputs are run through mix(), or through mix() in
0081   reverse, there are at least 32 bits of the output that are sometimes the same
0082   for one pair and different for another pair.  This was tested for:
0083
0084   * pairs that differed by one bit, by two bits, in any combination of top bits
0085     of (a,b,c), or in any combination of bottom bits of (a,b,c).
0086
0087   * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed the
0088     output delta to a Gray code (a^(a>>1)) so a string of 1's (as is commonly
0089     produced by subtraction) look like a single 1-bit difference.
0090
0091   * the base values were pseudorandom, all zero but one bit set, or all zero
0092     plus a counter that starts at zero.
0093
0094   Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that satisfy this
0095   are:
0096
0097      4  6  8 16 19  4
0098      9 15  3 18 27 15
0099     14  9  3  7 17  3
0100
0101   Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for "differ" defined
0102   as + with a one-bit base and a two-bit delta.  I used
0103   http://burtleburtle.net/bob/hash/avalanche.html to choose the operations,
0104   constants, and arrangements of the variables.
0105
0106   This does not achieve avalanche.  There are input bits of (a,b,c) that fail
0107   to affect some output bits of (a,b,c), especially of a.  The most thoroughly
0108   mixed value is c, but it doesn't really even achieve avalanche in c.
0109
0110   This allows some parallelism.  Read-after-writes are good at doubling the
0111   number of bits affected, so the goal of mixing pulls in the opposite
0112   direction as the goal of parallelism.  I did what I could.  Rotates seem to
0113   cost as much as shifts on every machine I could lay my hands on, and rotates
0114   are much kinder to the top and bottom bits, so I used rotates.
0115   -------------------------------------------------------------------------------
0116 */
0117 #define mix(a,b,c) \
0118   { \
0119   a -= c;  a ^= rot(c, 4);  c += b; \
0120   b -= a;  b ^= rot(a, 6);  a += c; \
0121   c -= b;  c ^= rot(b, 8);  b += a; \
0122   a -= c;  a ^= rot(c,16);  c += b; \
0123   b -= a;  b ^= rot(a,19);  a += c; \
0124   c -= b;  c ^= rot(b, 4);  b += a; \
0125   }
0126
0127 /*
0128   -------------------------------------------------------------------------------
0129   final -- final mixing of 3 32-bit values (a,b,c) into c
0130
0131   Pairs of (a,b,c) values differing in only a few bits will usually produce
0132   values of c that look totally different.  This was tested for
0133
0134   * pairs that differed by one bit, by two bits, in any combination of top bits
0135     of (a,b,c), or in any combination of bottom bits of (a,b,c).
0136
0137   * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed the
0138     output delta to a Gray code (a^(a>>1)) so a string of 1's (as is commonly
0139     produced by subtraction) look like a single 1-bit difference.  * the base
0140     values were pseudorandom, all zero but one bit set, or all zero plus a
0141     counter that starts at zero.
0142
0143   These constants passed:
0144     14 11 25 16 4 14 24
0145     12 14 25 16 4 14 24
0146   and these came close:
0147      4  8 15 26 3 22 24
0148     10  8 15 26 3 22 24
0149     11  8 15 26 3 22 24
0150   -------------------------------------------------------------------------------
0151 */
0152 #define final(a,b,c)        \
0153   {                         \
0154     c ^= b; c -= rot(b,14); \
0155     a ^= c; a -= rot(c,11); \
0156     b ^= a; b -= rot(a,25); \
0157     c ^= b; c -= rot(b,16); \
0158     a ^= c; a -= rot(c,4);  \
0159     b ^= a; b -= rot(a,14); \
0160     c ^= b; c -= rot(b,24); \
0161   }
0162
0163 /**
0164  * The follow is the documentation from Bob Jenkin's hash function:
0165  *
0166  *  http://burtleburtle.net/bob/hash/doobs.html
0167  *
0168  * The function hashlittle() has been renamed.
0169  *
0170  *  hashlittle() -- hash a variable-length key into a 32-bit value
0171  *
0172  *   k       : the key (the unaligned variable-length array of bytes)
0173  *   length  : the length of the key, counting by bytes
0174  *   initval : can be any 4-byte value
0175  *
0176  *  Returns a 32-bit value.  Every bit of the key affects every bit of the
0177  *  return value.  Two keys differing by one or two bits will have totally
0178  *  different hash values.
0179  *
0180  *  The best hash table sizes are powers of 2.  There is no need to do mod a
0181  *  prime (mod is sooo slow!).  If you need less than 32 bits, use a bitmask.
0182  *  For example, if you need only 10 bits, do h = (h & hashmask(10)); In which
0183  *  case, the hash table should have hashsize(10) elements.
0184  *
0185  *  If you are hashing n strings (uint8_t **)k, do it like this: for (i=0, h=0;
0186  *  i<n; ++i) h = hashlittle( k[i], len[i], h);
0187  *
0188  *  By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
0189  *  code any way you wish, private, educational, or commercial.  It's free.
0190  *
0191  *  Use for hash table lookup, or anything where one collision in 2^^32 is
0192  *  acceptable.  Do NOT use for cryptographic purposes.
0193 */
0194
0195 #define initval (20010928)
0196 uint32_t
0197 rtems_rfs_dir_hash (const void *key, size_t length)
0198 {
0199   uint32_t a,b,c;                             /* internal state */
0200   union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
0201
0202   /* Set up the internal state */
0203   a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;
0204
0205   u.ptr = key;
0206   if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
0207     const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
0208     /*const uint8_t  *k8;*/
0209
0210     /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
0211     while (length > 12)
0212     {
0213       a += k[0];
0214       b += k[1];
0215       c += k[2];
0216       mix(a,b,c);
0217       length -= 12;
0218       k += 3;
0219     }
0220
0221     /*----------------------------- handle the last (probably partial) block */
0222     /*
0223      * "k[2]&0xffffff" actually reads beyond the end of the string, but
0224      * then masks off the part it's not allowed to read.  Because the
0225      * string is aligned, the masked-off tail is in the same word as the
0226      * rest of the string.  Every machine with memory protection I've seen
0227      * does it on word boundaries, so is OK with this.  But VALGRIND will
0228      * still catch it and complain.  The masking trick does make the hash
0229      * noticably faster for short strings (like English words).
0230      */
0231 #ifndef VALGRIND
0232
0233     switch(length)
0234     {
0235       case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
0236       case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
0237       case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
0238       case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
0239       case 8 : b+=k[1]; a+=k[0]; break;
0240       case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
0241       case 6 : b+=k[1]&0xffff; a+=k[0]; break;
0242       case 5 : b+=k[1]&0xff; a+=k[0]; break;
0243       case 4 : a+=k[0]; break;
0244       case 3 : a+=k[0]&0xffffff; break;
0245       case 2 : a+=k[0]&0xffff; break;
0246       case 1 : a+=k[0]&0xff; break;
0247       case 0 : return c;              /* zero length strings require no mixing */
0248     }
0249
0250 #else /* make valgrind happy */
0251
0252     k8 = (const uint8_t *)k;
0253     switch(length)
0254     {
0255       case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
0256       case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
0257       case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
0258       case 9 : c+=k8[8];                   /* fall through */
0259       case 8 : b+=k[1]; a+=k[0]; break;
0260       case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
0261       case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
0262       case 5 : b+=k8[4];                   /* fall through */
0263       case 4 : a+=k[0]; break;
0264       case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
0265       case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
0266       case 1 : a+=k8[0]; break;
0267       case 0 : return c;
0268     }
0269
0270 #endif /* !valgrind */
0271
0272   } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
0273     const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
0274     const uint8_t  *k8;
0275
0276     /*--------------- all but last block: aligned reads and different mixing */
0277     while (length > 12)
0278     {
0279       a += k[0] + (((uint32_t)k[1])<<16);
0280       b += k[2] + (((uint32_t)k[3])<<16);
0281       c += k[4] + (((uint32_t)k[5])<<16);
0282       mix(a,b,c);
0283       length -= 12;
0284       k += 6;
0285     }
0286
0287     /*----------------------------- handle the last (probably partial) block */
0288     k8 = (const uint8_t *)k;
0289     switch(length)
0290     {
0291       case 12: c+=k[4]+(((uint32_t)k[5])<<16);
0292         b+=k[2]+(((uint32_t)k[3])<<16);
0293         a+=k[0]+(((uint32_t)k[1])<<16);
0294         break;
0295       case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
0296       case 10: c+=k[4];
0297         b+=k[2]+(((uint32_t)k[3])<<16);
0298         a+=k[0]+(((uint32_t)k[1])<<16);
0299         break;
0300       case 9 : c+=k8[8];                      /* fall through */
0301       case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
0302         a+=k[0]+(((uint32_t)k[1])<<16);
0303         break;
0304       case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
0305       case 6 : b+=k[2];
0306         a+=k[0]+(((uint32_t)k[1])<<16);
0307         break;
0308       case 5 : b+=k8[4];                      /* fall through */
0309       case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
0310         break;
0311       case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
0312       case 2 : a+=k[0];
0313         break;
0314       case 1 : a+=k8[0];
0315         break;
0316       case 0 : return c;                     /* zero length requires no mixing */
0317     }
0318
0319   } else {                        /* need to read the key one byte at a time */
0320     const uint8_t *k = (const uint8_t *)key;
0321
0322     /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
0323     while (length > 12)
0324     {
0325       a += k[0];
0326       a += ((uint32_t)k[1])<<8;
0327       a += ((uint32_t)k[2])<<16;
0328       a += ((uint32_t)k[3])<<24;
0329       b += k[4];
0330       b += ((uint32_t)k[5])<<8;
0331       b += ((uint32_t)k[6])<<16;
0332       b += ((uint32_t)k[7])<<24;
0333       c += k[8];
0334       c += ((uint32_t)k[9])<<8;
0335       c += ((uint32_t)k[10])<<16;
0336       c += ((uint32_t)k[11])<<24;
0337       mix(a,b,c);
0338       length -= 12;
0339       k += 12;
0340     }
0341
0342     /*-------------------------------- last block: affect all 32 bits of (c) */
0343     switch(length)                   /* all the case statements fall through */
0344     {
0345       case 12: c+=((uint32_t)k[11])<<24;
0346       case 11: c+=((uint32_t)k[10])<<16;
0347       case 10: c+=((uint32_t)k[9])<<8;
0348       case 9 : c+=k[8];
0349       case 8 : b+=((uint32_t)k[7])<<24;
0350       case 7 : b+=((uint32_t)k[6])<<16;
0351       case 6 : b+=((uint32_t)k[5])<<8;
0352       case 5 : b+=k[4];
0353       case 4 : a+=((uint32_t)k[3])<<24;
0354       case 3 : a+=((uint32_t)k[2])<<16;
0355       case 2 : a+=((uint32_t)k[1])<<8;
0356       case 1 : a+=k[0];
0357         break;
0358       case 0 : return c;
0359     }
0360   }
0361
0362   final(a,b,c);
0363   return c;
0364 }
0365