Issue #24762: Speed-up frozenset_hash() and greatly beef-up the comments.

This commit is contained in:
Raymond Hettinger 2015-08-01 09:53:00 -07:00
parent 99b80b5072
commit fbffdef47d
1 changed file with 43 additions and 27 deletions

View File

@ -739,41 +739,57 @@ set_traverse(PySetObject *so, visitproc visit, void *arg)
return 0; return 0;
} }
/* Work to increase the bit dispersion for closely spaced hash values.
   This is important because some use cases have many combinations of a
   small number of elements with nearby hashes so that many distinct
   combinations collapse to only a handful of distinct hash values. */
static Py_uhash_t
_shuffle_bits(Py_uhash_t h)
{
    /* XOR in a large prime, fold in a shifted copy of h, then multiply
       by a second large prime to disperse the bits. */
    Py_uhash_t mixed = h ^ 89869747UL;
    mixed ^= h << 16;
    return mixed * 3644798167UL;
}
/* Most of the constants in this hash algorithm are randomly chosen
large primes with "interesting bit patterns" and that passed tests
for good collision statistics on a variety of problematic datasets
including powersets and graph structures (such as David Eppstein's
graph recipes in Lib/test/test_set.py) */
static Py_hash_t static Py_hash_t
frozenset_hash(PyObject *self) frozenset_hash(PyObject *self)
{ {
/* Most of the constants in this hash algorithm are randomly choosen
large primes with "interesting bit patterns" and that passed
tests for good collision statistics on a variety of problematic
datasets such as:
ps = []
for r in range(21):
ps += itertools.combinations(range(20), r)
num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
*/
PySetObject *so = (PySetObject *)self; PySetObject *so = (PySetObject *)self;
Py_uhash_t h, hash = 1927868237UL; Py_uhash_t hash = 1927868237UL;
setentry *entry; setentry *entry;
Py_ssize_t pos = 0;
if (so->hash != -1)
return so->hash;
/* Make hash(frozenset({0})) distinct from hash(frozenset()) */
hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1; hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
while (set_next(so, &pos, &entry)) {
/* Work to increase the bit dispersion for closely spaced hash /* Xor-in shuffled bits from every entry's hash field because xor is
values. This is important because some use cases have many commutative and a frozenset hash should be independent of order.
combinations of a small number of elements with nearby
hashes so that many distinct combinations collapse to only For speed, include null entries and dummy entries and then
a handful of distinct hash values. */ subtract out their effect afterwards so that the final hash
h = entry->hash; depends only on active entries. This allows the code to be
hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL; vectorized by the compiler and it saves the unpredictable
} branches that would arise when trying to exclude null and dummy
/* Make the final result spread-out in a different pattern entries on every iteration. */
than the algorithm for tuples or other python objects. */
for (entry = so->table; entry <= &so->table[so->mask]; entry++)
hash ^= _shuffle_bits(entry->hash);
/* Remove the effect of an odd number of NULL entries */
if ((so->mask + 1 - so->fill) & 1)
hash ^= _shuffle_bits(0);
/* Remove the effect of an odd number of dummy entries */
if ((so->fill - so->used) & 1)
hash ^= _shuffle_bits(-1);
/* Disperse patterns arising in nested frozensets */
hash = hash * 69069U + 907133923UL; hash = hash * 69069U + 907133923UL;
if (hash == (Py_uhash_t)-1) if (hash == (Py_uhash_t)-1)
hash = 590923713UL; hash = 590923713UL;
so->hash = hash; so->hash = hash;