Line data Source code
1 : /* Set of hash utility functions to help maintaining the invariant that
2 : if a==b then hash(a)==hash(b)
3 :
4 : All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5 : */
6 : #include "Python.h"
7 :
8 : #ifdef __APPLE__
9 : # include <libkern/OSByteOrder.h>
10 : #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
11 : # include <endian.h>
12 : #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
13 : # include <sys/endian.h>
14 : #endif
15 :
16 : #ifdef __cplusplus
17 : extern "C" {
18 : #endif
19 :
20 : _Py_HashSecret_t _Py_HashSecret = {{0}};
21 :
22 : #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
23 : extern PyHash_FuncDef PyHash_Func;
24 : #else
25 : static PyHash_FuncDef PyHash_Func;
26 : #endif
27 :
28 : /* Count _Py_HashBytes() calls */
29 : #ifdef Py_HASH_STATS
30 : #define Py_HASH_STATS_MAX 32
31 : static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
32 : #endif
33 :
34 : /* For numeric types, the hash of a number x is based on the reduction
35 : of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
36 : hash(x) == hash(y) whenever x and y are numerically equal, even if
37 : x and y have different types.
38 :
39 : A quick summary of the hashing strategy:
40 :
41 : (1) First define the 'reduction of x modulo P' for any rational
42 : number x; this is a standard extension of the usual notion of
43 : reduction modulo P for integers. If x == p/q (written in lowest
44 : terms), the reduction is interpreted as the reduction of p times
45 : the inverse of the reduction of q, all modulo P; if q is exactly
46 : divisible by P then define the reduction to be infinity. So we've
47 : got a well-defined map
48 :
49 : reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
50 :
51 : (2) Now for a rational number x, define hash(x) by:
52 :
53 : reduce(x) if x >= 0
54 : -reduce(-x) if x < 0
55 :
56 : If the result of the reduction is infinity (this is impossible for
57 : integers, floats and Decimals) then use the predefined hash value
58 : _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59 : _PyHASH_INF and -_PyHASH_INF are also used for the
60 : hashes of float and Decimal infinities.
61 :
62 : NaNs hash with a pointer hash. Having distinct hash values prevents
63 : catastrophic pileups from distinct NaN instances which used to always
64 : have the same hash value but would compare unequal.
65 :
66 : A selling point for the above strategy is that it makes it possible
67 : to compute hashes of decimal and binary floating-point numbers
68 : efficiently, even if the exponent of the binary or decimal number
69 : is large. The key point is that
70 :
71 : reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
72 :
73 : provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
74 : binary or decimal float is never infinity, since the denominator is a power
75 : of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
76 : for nonnegative x,
77 :
78 : reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
79 :
80 : reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
81 :
82 : and reduce(10**e) can be computed efficiently by the usual modular
83 : exponentiation algorithm. For reduce(2**e) it's even better: since
84 : P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
85 : by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
86 :
87 : */
88 :
89 : Py_hash_t _Py_HashPointer(const void *);
90 :
91 : Py_hash_t
92 197666 : _Py_HashDouble(PyObject *inst, double v)
93 : {
94 : int e, sign;
95 : double m;
96 : Py_uhash_t x, y;
97 :
98 197666 : if (!Py_IS_FINITE(v)) {
99 306 : if (Py_IS_INFINITY(v))
100 215 : return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
101 : else
102 91 : return _Py_HashPointer(inst);
103 : }
104 :
105 197360 : m = frexp(v, &e);
106 :
107 197360 : sign = 1;
108 197360 : if (m < 0) {
109 21810 : sign = -1;
110 21810 : m = -m;
111 : }
112 :
113 : /* process 28 bits at a time; this should work well both for binary
114 : and hexadecimal floating point. */
115 197360 : x = 0;
116 458051 : while (m) {
117 260691 : x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
118 260691 : m *= 268435456.0; /* 2**28 */
119 260691 : e -= 28;
120 260691 : y = (Py_uhash_t)m; /* pull out integer part */
121 260691 : m -= y;
122 260691 : x += y;
123 260691 : if (x >= _PyHASH_MODULUS)
124 0 : x -= _PyHASH_MODULUS;
125 : }
126 :
127 : /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
128 197360 : e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
129 197360 : x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
130 :
131 197360 : x = x * sign;
132 197360 : if (x == (Py_uhash_t)-1)
133 1664 : x = (Py_uhash_t)-2;
134 197360 : return (Py_hash_t)x;
135 : }
136 :
137 : Py_hash_t
138 41366900 : _Py_HashPointerRaw(const void *p)
139 : {
140 41366900 : size_t y = (size_t)p;
141 : /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
142 : excessive hash collisions for dicts and sets */
143 41366900 : y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
144 41366900 : return (Py_hash_t)y;
145 : }
146 :
147 : Py_hash_t
148 23803100 : _Py_HashPointer(const void *p)
149 : {
150 23803100 : Py_hash_t x = _Py_HashPointerRaw(p);
151 23803100 : if (x == -1) {
152 0 : x = -2;
153 : }
154 23803100 : return x;
155 : }
156 :
157 : Py_hash_t
158 125786000 : _Py_HashBytes(const void *src, Py_ssize_t len)
159 : {
160 : Py_hash_t x;
161 : /*
162 : We make the hash of the empty string be 0, rather than using
163 : (prefix ^ suffix), since this slightly obfuscates the hash secret
164 : */
165 125786000 : if (len == 0) {
166 5821 : return 0;
167 : }
168 :
169 : #ifdef Py_HASH_STATS
170 : hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
171 : #endif
172 :
173 : #if Py_HASH_CUTOFF > 0
174 : if (len < Py_HASH_CUTOFF) {
175 : /* Optimize hashing of very small strings with inline DJBX33A. */
176 : Py_uhash_t hash;
177 : const unsigned char *p = src;
178 : hash = 5381; /* DJBX33A starts with 5381 */
179 :
180 : switch(len) {
181 : /* ((hash << 5) + hash) + *p == hash * 33 + *p */
182 : case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
183 : case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
184 : case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
185 : case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
186 : case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
187 : case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
188 : case 1: hash = ((hash << 5) + hash) + *p++; break;
189 : default:
190 : Py_UNREACHABLE();
191 : }
192 : hash ^= len;
193 : hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
194 : x = (Py_hash_t)hash;
195 : }
196 : else
197 : #endif /* Py_HASH_CUTOFF */
198 125780000 : x = PyHash_Func.hash(src, len);
199 :
200 125780000 : if (x == -1)
201 0 : return -2;
202 125780000 : return x;
203 : }
204 :
205 : void
206 2952 : _PyHash_Fini(void)
207 : {
208 : #ifdef Py_HASH_STATS
209 : fprintf(stderr, "len calls total\n");
210 : Py_ssize_t total = 0;
211 : for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
212 : total += hashstats[i];
213 : fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
214 : }
215 : total += hashstats[0];
216 : fprintf(stderr, "> %8zd %8zd\n", hashstats[0], total);
217 : #endif
218 2952 : }
219 :
220 : PyHash_FuncDef *
221 3134 : PyHash_GetFuncDef(void)
222 : {
223 3134 : return &PyHash_Func;
224 : }
225 :
226 : /* Optimized memcpy() for Windows */
227 : #ifdef _MSC_VER
228 : # if SIZEOF_PY_UHASH_T == 4
229 : # define PY_UHASH_CPY(dst, src) do { \
230 : dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
231 : } while(0)
232 : # elif SIZEOF_PY_UHASH_T == 8
233 : # define PY_UHASH_CPY(dst, src) do { \
234 : dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
235 : dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
236 : } while(0)
237 : # else
238 : # error SIZEOF_PY_UHASH_T must be 4 or 8
239 : # endif /* SIZEOF_PY_UHASH_T */
240 : #else /* not Windows */
241 : # define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
242 : #endif /* _MSC_VER */
243 :
244 :
245 : #if Py_HASH_ALGORITHM == Py_HASH_FNV
246 : /* **************************************************************************
247 : * Modified Fowler-Noll-Vo (FNV) hash function
248 : */
249 : static Py_hash_t
250 : fnv(const void *src, Py_ssize_t len)
251 : {
252 : const unsigned char *p = src;
253 : Py_uhash_t x;
254 : Py_ssize_t remainder, blocks;
255 : union {
256 : Py_uhash_t value;
257 : unsigned char bytes[SIZEOF_PY_UHASH_T];
258 : } block;
259 :
260 : #ifdef Py_DEBUG
261 : assert(_Py_HashSecret_Initialized);
262 : #endif
263 : remainder = len % SIZEOF_PY_UHASH_T;
264 : if (remainder == 0) {
265 : /* Process at least one block byte by byte to reduce hash collisions
266 : * for strings with common prefixes. */
267 : remainder = SIZEOF_PY_UHASH_T;
268 : }
269 : blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
270 :
271 : x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
272 : x ^= (Py_uhash_t) *p << 7;
273 : while (blocks--) {
274 : PY_UHASH_CPY(block.bytes, p);
275 : x = (_PyHASH_MULTIPLIER * x) ^ block.value;
276 : p += SIZEOF_PY_UHASH_T;
277 : }
278 : /* add remainder */
279 : for (; remainder > 0; remainder--)
280 : x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
281 : x ^= (Py_uhash_t) len;
282 : x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
283 : if (x == (Py_uhash_t) -1) {
284 : x = (Py_uhash_t) -2;
285 : }
286 : return x;
287 : }
288 :
289 : static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
290 : 16 * SIZEOF_PY_HASH_T};
291 :
292 : #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
293 :
294 :
295 : /* **************************************************************************
296 : <MIT License>
297 : Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
298 :
299 : Permission is hereby granted, free of charge, to any person obtaining a copy
300 : of this software and associated documentation files (the "Software"), to deal
301 : in the Software without restriction, including without limitation the rights
302 : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
303 : copies of the Software, and to permit persons to whom the Software is
304 : furnished to do so, subject to the following conditions:
305 :
306 : The above copyright notice and this permission notice shall be included in
307 : all copies or substantial portions of the Software.
308 :
309 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
310 : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
311 : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
312 : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
313 : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
314 : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
315 : THE SOFTWARE.
316 : </MIT License>
317 :
318 : Original location:
319 : https://github.com/majek/csiphash/
320 :
321 : Solution inspired by code from:
322 : Samuel Neves (supercop/crypto_auth/siphash24/little)
323 : djb (supercop/crypto_auth/siphash24/little2)
324 : Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
325 :
326 : Modified for Python by Christian Heimes:
327 : - C89 / MSVC compatibility
328 : - _rotl64() on Windows
329 : - letoh64() fallback
330 : */
331 :
332 : /* byte swap little endian to host endian
333 : * Endian conversion not only ensures that the hash function returns the same
334 : * value on all platforms. It is also required to for a good dispersion of
335 : * the hash values' least significant bits.
336 : */
337 : #if PY_LITTLE_ENDIAN
338 : # define _le64toh(x) ((uint64_t)(x))
339 : #elif defined(__APPLE__)
340 : # define _le64toh(x) OSSwapLittleToHostInt64(x)
341 : #elif defined(HAVE_LETOH64)
342 : # define _le64toh(x) le64toh(x)
343 : #else
344 : # define _le64toh(x) (((uint64_t)(x) << 56) | \
345 : (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
346 : (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
347 : (((uint64_t)(x) << 8) & 0xff00000000ULL) | \
348 : (((uint64_t)(x) >> 8) & 0xff000000ULL) | \
349 : (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
350 : (((uint64_t)(x) >> 40) & 0xff00ULL) | \
351 : ((uint64_t)(x) >> 56))
352 : #endif
353 :
354 :
355 : #ifdef _MSC_VER
356 : # define ROTATE(x, b) _rotl64(x, b)
357 : #else
358 : # define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
359 : #endif
360 :
361 : #define HALF_ROUND(a,b,c,d,s,t) \
362 : a += b; c += d; \
363 : b = ROTATE(b, s) ^ a; \
364 : d = ROTATE(d, t) ^ c; \
365 : a = ROTATE(a, 32);
366 :
367 : #define SINGLE_ROUND(v0,v1,v2,v3) \
368 : HALF_ROUND(v0,v1,v2,v3,13,16); \
369 : HALF_ROUND(v2,v1,v0,v3,17,21);
370 :
371 : #define DOUBLE_ROUND(v0,v1,v2,v3) \
372 : SINGLE_ROUND(v0,v1,v2,v3); \
373 : SINGLE_ROUND(v0,v1,v2,v3);
374 :
375 :
376 : static uint64_t
377 125781000 : siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
378 125781000 : uint64_t b = (uint64_t)src_sz << 56;
379 125781000 : const uint8_t *in = (const uint8_t*)src;
380 :
381 125781000 : uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
382 125781000 : uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
383 125781000 : uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
384 125781000 : uint64_t v3 = k1 ^ 0x7465646279746573ULL;
385 :
386 : uint64_t t;
387 : uint8_t *pt;
388 :
389 251324000 : while (src_sz >= 8) {
390 : uint64_t mi;
391 125543000 : memcpy(&mi, in, sizeof(mi));
392 125543000 : mi = _le64toh(mi);
393 125543000 : in += sizeof(mi);
394 125543000 : src_sz -= sizeof(mi);
395 125543000 : v3 ^= mi;
396 125543000 : SINGLE_ROUND(v0,v1,v2,v3);
397 125543000 : v0 ^= mi;
398 : }
399 :
400 125781000 : t = 0;
401 125781000 : pt = (uint8_t *)&t;
402 125781000 : switch (src_sz) {
403 17300300 : case 7: pt[6] = in[6]; /* fall through */
404 35429700 : case 6: pt[5] = in[5]; /* fall through */
405 48131600 : case 5: pt[4] = in[4]; /* fall through */
406 70812900 : case 4: memcpy(pt, in, sizeof(uint32_t)); break;
407 17574600 : case 3: pt[2] = in[2]; /* fall through */
408 32495900 : case 2: pt[1] = in[1]; /* fall through */
409 41308300 : case 1: pt[0] = in[0]; /* fall through */
410 : }
411 125781000 : b |= _le64toh(t);
412 :
413 125781000 : v3 ^= b;
414 125781000 : SINGLE_ROUND(v0,v1,v2,v3);
415 125781000 : v0 ^= b;
416 125781000 : v2 ^= 0xff;
417 125781000 : SINGLE_ROUND(v0,v1,v2,v3);
418 125781000 : SINGLE_ROUND(v0,v1,v2,v3);
419 125781000 : SINGLE_ROUND(v0,v1,v2,v3);
420 :
421 : /* modified */
422 125781000 : t = (v0 ^ v1) ^ (v2 ^ v3);
423 125781000 : return t;
424 : }
425 :
426 : #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
427 : static uint64_t
428 : siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
429 : uint64_t b = (uint64_t)src_sz << 56;
430 : const uint8_t *in = (const uint8_t*)src;
431 :
432 : uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
433 : uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
434 : uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
435 : uint64_t v3 = k1 ^ 0x7465646279746573ULL;
436 :
437 : uint64_t t;
438 : uint8_t *pt;
439 :
440 : while (src_sz >= 8) {
441 : uint64_t mi;
442 : memcpy(&mi, in, sizeof(mi));
443 : mi = _le64toh(mi);
444 : in += sizeof(mi);
445 : src_sz -= sizeof(mi);
446 : v3 ^= mi;
447 : DOUBLE_ROUND(v0,v1,v2,v3);
448 : v0 ^= mi;
449 : }
450 :
451 : t = 0;
452 : pt = (uint8_t *)&t;
453 : switch (src_sz) {
454 : case 7: pt[6] = in[6]; /* fall through */
455 : case 6: pt[5] = in[5]; /* fall through */
456 : case 5: pt[4] = in[4]; /* fall through */
457 : case 4: memcpy(pt, in, sizeof(uint32_t)); break;
458 : case 3: pt[2] = in[2]; /* fall through */
459 : case 2: pt[1] = in[1]; /* fall through */
460 : case 1: pt[0] = in[0]; /* fall through */
461 : }
462 : b |= _le64toh(t);
463 :
464 : v3 ^= b;
465 : DOUBLE_ROUND(v0,v1,v2,v3);
466 : v0 ^= b;
467 : v2 ^= 0xff;
468 : DOUBLE_ROUND(v0,v1,v2,v3);
469 : DOUBLE_ROUND(v0,v1,v2,v3);
470 :
471 : /* modified */
472 : t = (v0 ^ v1) ^ (v2 ^ v3);
473 : return t;
474 : }
475 : #endif
476 :
477 : uint64_t
478 665 : _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
479 : {
480 665 : return siphash13(key, 0, src, src_sz);
481 : }
482 :
483 :
484 : #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
485 : static Py_hash_t
486 125780000 : pysiphash(const void *src, Py_ssize_t src_sz) {
487 125780000 : return (Py_hash_t)siphash13(
488 125780000 : _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
489 : src, src_sz);
490 : }
491 :
492 : static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
493 : #endif
494 :
495 : #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
496 : static Py_hash_t
497 : pysiphash(const void *src, Py_ssize_t src_sz) {
498 : return (Py_hash_t)siphash24(
499 : _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
500 : src, src_sz);
501 : }
502 :
503 : static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
504 : #endif
505 :
506 : #ifdef __cplusplus
507 : }
508 : #endif
|