Coverage Report

Created: 2022-07-08 09:39

/home/mdboom/Work/builds/cpython/Python/pyhash.c
Line
Count
Source (jump to first uncovered line)
1
/* Set of hash utility functions to help maintaining the invariant that
2
    if a==b then hash(a)==hash(b)
3
4
   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5
*/
6
#include "Python.h"
7
8
#ifdef __APPLE__
9
#  include <libkern/OSByteOrder.h>
10
#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11
#  include <endian.h>
12
#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13
#  include <sys/endian.h>
14
#endif
15
16
#ifdef __cplusplus
17
extern "C" {
18
#endif
19
20
_Py_HashSecret_t _Py_HashSecret = {{0}};
21
22
#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23
extern PyHash_FuncDef PyHash_Func;
24
#else
25
static PyHash_FuncDef PyHash_Func;
26
#endif
27
28
/* Count _Py_HashBytes() calls */
29
#ifdef Py_HASH_STATS
30
#define Py_HASH_STATS_MAX 32
31
static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32
#endif
33
34
/* For numeric types, the hash of a number x is based on the reduction
35
   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
36
   hash(x) == hash(y) whenever x and y are numerically equal, even if
37
   x and y have different types.
38
39
   A quick summary of the hashing strategy:
40
41
   (1) First define the 'reduction of x modulo P' for any rational
42
   number x; this is a standard extension of the usual notion of
43
   reduction modulo P for integers.  If x == p/q (written in lowest
44
   terms), the reduction is interpreted as the reduction of p times
45
   the inverse of the reduction of q, all modulo P; if q is exactly
46
   divisible by P then define the reduction to be infinity.  So we've
47
   got a well-defined map
48
49
      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50
51
   (2) Now for a rational number x, define hash(x) by:
52
53
      reduce(x)   if x >= 0
54
      -reduce(-x) if x < 0
55
56
   If the result of the reduction is infinity (this is impossible for
57
   integers, floats and Decimals) then use the predefined hash value
58
   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59
   _PyHASH_INF and -_PyHASH_INF are also used for the
60
   hashes of float and Decimal infinities.
61
62
   NaNs hash with a pointer hash.  Having distinct hash values prevents
63
   catastrophic pileups from distinct NaN instances which used to always
64
   have the same hash value but would compare unequal.
65
66
   A selling point for the above strategy is that it makes it possible
67
   to compute hashes of decimal and binary floating-point numbers
68
   efficiently, even if the exponent of the binary or decimal number
69
   is large.  The key point is that
70
71
      reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
72
73
   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
74
   binary or decimal float is never infinity, since the denominator is a power
75
   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
76
   for nonnegative x,
77
78
      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
79
80
      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
81
82
   and reduce(10**e) can be computed efficiently by the usual modular
83
   exponentiation algorithm.  For reduce(2**e) it's even better: since
84
   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
85
   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
86
87
   */
88
89
Py_hash_t _Py_HashPointer(const void *);
90
91
Py_hash_t
92
_Py_HashDouble(PyObject *inst, double v)
93
{
94
    int e, sign;
95
    double m;
96
    Py_uhash_t x, y;
97
98
    if (!Py_IS_FINITE(v)) {
  Branch (98:9): [True: 101, False: 45.3k]
99
        if (Py_IS_INFINITY(v))
100
            return v > 0 ? 
_PyHASH_INF37
:
-37
_PyHASH_INF37
;
  Branch (100:20): [True: 37, False: 37]
101
        else
102
            return _Py_HashPointer(inst);
103
    }
104
105
    m = frexp(v, &e);
106
107
    sign = 1;
108
    if (m < 0) {
  Branch (108:9): [True: 8.58k, False: 36.7k]
109
        sign = -1;
110
        m = -m;
111
    }
112
113
    /* process 28 bits at a time;  this should work well both for binary
114
       and hexadecimal floating point. */
115
    x = 0;
116
    while (m) {
  Branch (116:12): [True: 73.7k, False: 45.3k]
117
        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
118
        m *= 268435456.0;  /* 2**28 */
119
        e -= 28;
120
        y = (Py_uhash_t)m;  /* pull out integer part */
121
        m -= y;
122
        x += y;
123
        if (x >= _PyHASH_MODULUS)
  Branch (123:13): [True: 0, False: 73.7k]
124
            x -= _PyHASH_MODULUS;
125
    }
126
127
    /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
128
    e = e >= 0 ? 
e % 9.93k
_PyHASH_BITS9.93k
:
_PyHASH_BITS35.3k
-1-((-1-e) % 35.3k
_PyHASH_BITS35.3k
);
  Branch (128:9): [True: 9.93k, False: 35.3k]
129
    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
130
131
    x = x * sign;
132
    if (x == (Py_uhash_t)-1)
  Branch (132:9): [True: 153, False: 45.1k]
133
        x = (Py_uhash_t)-2;
134
    return (Py_hash_t)x;
135
}
136
137
Py_hash_t
138
_Py_HashPointerRaw(const void *p)
139
{
140
    size_t y = (size_t)p;
141
    /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
142
       excessive hash collisions for dicts and sets */
143
    y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
144
    return (Py_hash_t)y;
145
}
146
147
Py_hash_t
148
_Py_HashPointer(const void *p)
149
{
150
    Py_hash_t x = _Py_HashPointerRaw(p);
151
    if (x == -1) {
  Branch (151:9): [True: 0, False: 13.5M]
152
        x = -2;
153
    }
154
    return x;
155
}
156
157
Py_hash_t
158
_Py_HashBytes(const void *src, Py_ssize_t len)
159
{
160
    Py_hash_t x;
161
    /*
162
      We make the hash of the empty string be 0, rather than using
163
      (prefix ^ suffix), since this slightly obfuscates the hash secret
164
    */
165
    if (len == 0) {
  Branch (165:9): [True: 200, False: 15.9M]
166
        return 0;
167
    }
168
169
#ifdef Py_HASH_STATS
170
    hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
171
#endif
172
173
#if Py_HASH_CUTOFF > 0
174
    if (len < Py_HASH_CUTOFF) {
175
        /* Optimize hashing of very small strings with inline DJBX33A. */
176
        Py_uhash_t hash;
177
        const unsigned char *p = src;
178
        hash = 5381; /* DJBX33A starts with 5381 */
179
180
        switch(len) {
181
            /* ((hash << 5) + hash) + *p == hash * 33 + *p */
182
            case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
183
            case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
184
            case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
185
            case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
186
            case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
187
            case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
188
            case 1: hash = ((hash << 5) + hash) + *p++; break;
189
            default:
190
                Py_UNREACHABLE();
191
        }
192
        hash ^= len;
193
        hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
194
        x = (Py_hash_t)hash;
195
    }
196
    else
197
#endif /* Py_HASH_CUTOFF */
198
        x = PyHash_Func.hash(src, len);
199
200
    if (x == -1)
  Branch (200:9): [True: 0, False: 15.9M]
201
        return -2;
202
    return x;
203
}
204
205
void
206
_PyHash_Fini(void)
207
{
208
#ifdef Py_HASH_STATS
209
    fprintf(stderr, "len   calls    total\n");
210
    Py_ssize_t total = 0;
211
    for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
212
        total += hashstats[i];
213
        fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
214
    }
215
    total += hashstats[0];
216
    fprintf(stderr, ">  %8zd %8zd\n", hashstats[0], total);
217
#endif
218
}
219
220
PyHash_FuncDef *
221
PyHash_GetFuncDef(void)
222
{
223
    return &PyHash_Func;
224
}
225
226
/* Optimized memcpy() for Windows */
227
#ifdef _MSC_VER
228
#  if SIZEOF_PY_UHASH_T == 4
229
#    define PY_UHASH_CPY(dst, src) do {                                    \
230
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
231
       } while(0)
232
#  elif SIZEOF_PY_UHASH_T == 8
233
#    define PY_UHASH_CPY(dst, src) do {                                    \
234
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
235
       dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
236
       } while(0)
237
#  else
238
#    error SIZEOF_PY_UHASH_T must be 4 or 8
239
#  endif /* SIZEOF_PY_UHASH_T */
240
#else /* not Windows */
241
#  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
242
#endif /* _MSC_VER */
243
244
245
#if Py_HASH_ALGORITHM == Py_HASH_FNV
246
/* **************************************************************************
247
 * Modified Fowler-Noll-Vo (FNV) hash function
248
 */
249
static Py_hash_t
250
fnv(const void *src, Py_ssize_t len)
251
{
252
    const unsigned char *p = src;
253
    Py_uhash_t x;
254
    Py_ssize_t remainder, blocks;
255
    union {
256
        Py_uhash_t value;
257
        unsigned char bytes[SIZEOF_PY_UHASH_T];
258
    } block;
259
260
#ifdef Py_DEBUG
261
    assert(_Py_HashSecret_Initialized);
262
#endif
263
    remainder = len % SIZEOF_PY_UHASH_T;
264
    if (remainder == 0) {
265
        /* Process at least one block byte by byte to reduce hash collisions
266
         * for strings with common prefixes. */
267
        remainder = SIZEOF_PY_UHASH_T;
268
    }
269
    blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
270
271
    x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
272
    x ^= (Py_uhash_t) *p << 7;
273
    while (blocks--) {
274
        PY_UHASH_CPY(block.bytes, p);
275
        x = (_PyHASH_MULTIPLIER * x) ^ block.value;
276
        p += SIZEOF_PY_UHASH_T;
277
    }
278
    /* add remainder */
279
    for (; remainder > 0; remainder--)
280
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
281
    x ^= (Py_uhash_t) len;
282
    x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
283
    if (x == (Py_uhash_t) -1) {
284
        x = (Py_uhash_t) -2;
285
    }
286
    return x;
287
}
288
289
static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
290
                                     16 * SIZEOF_PY_HASH_T};
291
292
#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
293
294
295
/* **************************************************************************
296
 <MIT License>
297
 Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
298
299
 Permission is hereby granted, free of charge, to any person obtaining a copy
300
 of this software and associated documentation files (the "Software"), to deal
301
 in the Software without restriction, including without limitation the rights
302
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
303
 copies of the Software, and to permit persons to whom the Software is
304
 furnished to do so, subject to the following conditions:
305
306
 The above copyright notice and this permission notice shall be included in
307
 all copies or substantial portions of the Software.
308
309
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
310
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
311
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
312
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
313
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
314
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
315
 THE SOFTWARE.
316
 </MIT License>
317
318
 Original location:
319
    https://github.com/majek/csiphash/
320
321
 Solution inspired by code from:
322
    Samuel Neves (supercop/crypto_auth/siphash24/little)
323
    djb (supercop/crypto_auth/siphash24/little2)
324
    Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
325
326
 Modified for Python by Christian Heimes:
327
    - C89 / MSVC compatibility
328
    - _rotl64() on Windows
329
    - letoh64() fallback
330
*/
331
332
/* byte swap little endian to host endian
333
 * Endian conversion not only ensures that the hash function returns the same
334
 * value on all platforms. It is also required to for a good dispersion of
335
 * the hash values' least significant bits.
336
 */
337
#if PY_LITTLE_ENDIAN
338
#  define _le64toh(x) ((uint64_t)(x))
339
#elif defined(__APPLE__)
340
#  define _le64toh(x) OSSwapLittleToHostInt64(x)
341
#elif defined(HAVE_LETOH64)
342
#  define _le64toh(x) le64toh(x)
343
#else
344
#  define _le64toh(x) (((uint64_t)(x) << 56) | \
345
                      (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
346
                      (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
347
                      (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
348
                      (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
349
                      (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
350
                      (((uint64_t)(x) >> 40) & 0xff00ULL) | \
351
                      ((uint64_t)(x)  >> 56))
352
#endif
353
354
355
#ifdef _MSC_VER
356
#  define ROTATE(x, b)  _rotl64(x, b)
357
#else
358
#  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
359
#endif
360
361
#define HALF_ROUND(a,b,c,d,s,t)     \
362
    a += b; c += d;                 \
363
    b = ROTATE(b, s) ^ a;           \
364
    d = ROTATE(d, t) ^ c;           \
365
    a = ROTATE(a, 32);
366
367
#define SINGLE_ROUND(v0,v1,v2,v3)   \
368
    HALF_ROUND(v0,v1,v2,v3,13,16);  \
369
    HALF_ROUND(v2,v1,v0,v3,17,21);
370
371
#define DOUBLE_ROUND(v0,v1,v2,v3)   \
372
    SINGLE_ROUND(v0,v1,v2,v3);      \
373
    SINGLE_ROUND(v0,v1,v2,v3);
374
375
376
static uint64_t
377
siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
378
    uint64_t b = (uint64_t)src_sz << 56;
379
    const uint8_t *in = (const uint8_t*)src;
380
381
    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
382
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
383
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
384
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;
385
386
    uint64_t t;
387
    uint8_t *pt;
388
389
    while (src_sz >= 8) {
  Branch (389:12): [True: 36.3M, False: 15.9M]
390
        uint64_t mi;
391
        memcpy(&mi, in, sizeof(mi));
392
        mi = _le64toh(mi);
393
        in += sizeof(mi);
394
        src_sz -= sizeof(mi);
395
        v3 ^= mi;
396
        SINGLE_ROUND(v0,v1,v2,v3);
397
        v0 ^= mi;
398
    }
399
400
    t = 0;
401
    pt = (uint8_t *)&t;
402
    switch (src_sz) {
  Branch (402:13): [True: 972k, False: 14.9M]
403
        case 7: pt[6] = in[6]; /* fall through */
  Branch (403:9): [True: 1.04M, False: 14.8M]
404
        case 6: pt[5] = in[5]; /* fall through */
  Branch (404:9): [True: 3.28M, False: 12.6M]
405
        case 5: pt[4] = in[4]; /* fall through */
  Branch (405:9): [True: 1.15M, False: 14.7M]
406
        case 4: memcpy(pt, in, sizeof(uint32_t)); break;
  Branch (406:9): [True: 1.91M, False: 13.9M]
407
        case 3: pt[2] = in[2]; /* fall through */
  Branch (407:9): [True: 3.80M, False: 12.0M]
408
        case 2: pt[1] = in[1]; /* fall through */
  Branch (408:9): [True: 2.90M, False: 12.9M]
409
        case 1: pt[0] = in[0]; /* fall through */
  Branch (409:9): [True: 829k, False: 15.0M]
410
    }
411
    b |= _le64toh(t);
412
413
    v3 ^= b;
414
    SINGLE_ROUND(v0,v1,v2,v3);
415
    v0 ^= b;
416
    v2 ^= 0xff;
417
    SINGLE_ROUND(v0,v1,v2,v3);
418
    SINGLE_ROUND(v0,v1,v2,v3);
419
    SINGLE_ROUND(v0,v1,v2,v3);
420
421
    /* modified */
422
    t = (v0 ^ v1) ^ (v2 ^ v3);
423
    return t;
424
}
425
426
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
427
static uint64_t
428
siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
429
    uint64_t b = (uint64_t)src_sz << 56;
430
    const uint8_t *in = (const uint8_t*)src;
431
432
    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
433
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
434
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
435
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;
436
437
    uint64_t t;
438
    uint8_t *pt;
439
440
    while (src_sz >= 8) {
441
        uint64_t mi;
442
        memcpy(&mi, in, sizeof(mi));
443
        mi = _le64toh(mi);
444
        in += sizeof(mi);
445
        src_sz -= sizeof(mi);
446
        v3 ^= mi;
447
        DOUBLE_ROUND(v0,v1,v2,v3);
448
        v0 ^= mi;
449
    }
450
451
    t = 0;
452
    pt = (uint8_t *)&t;
453
    switch (src_sz) {
454
        case 7: pt[6] = in[6]; /* fall through */
455
        case 6: pt[5] = in[5]; /* fall through */
456
        case 5: pt[4] = in[4]; /* fall through */
457
        case 4: memcpy(pt, in, sizeof(uint32_t)); break;
458
        case 3: pt[2] = in[2]; /* fall through */
459
        case 2: pt[1] = in[1]; /* fall through */
460
        case 1: pt[0] = in[0]; /* fall through */
461
    }
462
    b |= _le64toh(t);
463
464
    v3 ^= b;
465
    DOUBLE_ROUND(v0,v1,v2,v3);
466
    v0 ^= b;
467
    v2 ^= 0xff;
468
    DOUBLE_ROUND(v0,v1,v2,v3);
469
    DOUBLE_ROUND(v0,v1,v2,v3);
470
471
    /* modified */
472
    t = (v0 ^ v1) ^ (v2 ^ v3);
473
    return t;
474
}
475
#endif
476
477
uint64_t
478
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
479
{
480
    return siphash13(key, 0, src, src_sz);
481
}
482
483
484
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
485
static Py_hash_t
486
pysiphash(const void *src, Py_ssize_t src_sz) {
487
    return (Py_hash_t)siphash13(
488
        _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
489
        src, src_sz);
490
}
491
492
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
493
#endif
494
495
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
496
static Py_hash_t
497
pysiphash(const void *src, Py_ssize_t src_sz) {
498
    return (Py_hash_t)siphash24(
499
        _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
500
        src, src_sz);
501
}
502
503
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
504
#endif
505
506
#ifdef __cplusplus
507
}
508
#endif