diff options
author | Shawn O. Pearce <spearce@spearce.org> | 2010-09-23 17:13:20 -0700 |
---|---|---|
committer | Shawn O. Pearce <spearce@spearce.org> | 2010-09-23 17:44:53 -0700 |
commit | 11f99fecfd4b839d258a77092e06daedd5660821 (patch) | |
tree | 2ba03c4b49691ec0bf31629280b19406387e4f6a /org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java | |
parent | 4447d76a419926aa9f0640b7a045d8149350c506 (diff) | |
download | jgit-11f99fecfd4b839d258a77092e06daedd5660821.tar.gz jgit-11f99fecfd4b839d258a77092e06daedd5660821.zip |
Reduce content hash function collisions
The hash code returned by RawTextComparator (or that is used
by the SimilarityIndex) play an important role in the speed of
any algorithm that is based upon them. The lower the number of
collisions produced by the hash function, the shorter the hash
chains within hash tables will be, and the less likely we are to
fall into O(N^2) runtime behaviors for algorithms like PatienceDiff.
Our prior hash function was absolutely horrid, so replace it with
the proper definition of the DJB hash that was originally published
by Professor Daniel J. Bernstein.
To support this assertion, below is a table listing the maximum
number of collisions that result when hashing the unique lines in
each source code file of 3 randomly chosen projects:
test_jgit: 931 files; 122 avg. unique lines/file
Algorithm | Collisions
-------------+-----------
prior_hash 418
djb 5
sha1 6
string_hash31 11
test_linux26: 30198 files; 258 avg. unique lines/file
Algorithm | Collisions
-------------+-----------
prior_hash 8675
djb 32
sha1 8
string_hash31 32
test_frameworks_base: 8381 files; 184 avg. unique lines/file
Algorithm | Collisions
-------------+-----------
prior_hash 4615
djb 10
sha1 6
string_hash31 13
We can clearly see that prior_hash performed very poorly, resulting
in 8,675 collisions (elements in the same hash bucket) for at least
one file in the Linux kernel repository. This leads to some very
bad O(N) style insertion and lookup performance, even though the
hash table was sized to be the next power-of-2 larger than the
total number of unique lines in the file.
The djb hash we are replacing prior_hash with performs closer to
SHA-1 in terms of having very few collisions. This indicates it
provides a reasonably distributed output for this type of input,
despite being a much simpler algorithm (and therefore will be much
faster to execute).
The string_hash31 function is provided just to compare results with,
it is the algorithm commonly used by java.lang.String hashCode().
However, life isn't quite this simple.
djb produces a 32 bit hash code, but our hash tables are always
smaller than 2^32 buckets. Mashing the 32 bit code into an array
index used to be done by simply taking the lower bits of the hash
code by a bitwise and operator. This unfortuntely still produces
many collisions, e.g. 32 on the linux-2.6 repository files.
From [1] we can apply a final "cleanup" step to the hash code to
mix the bits together a little better, and give priority to the
higher order bits as they include data from more bytes of input:
test_jgit: 931 files; 122 avg. unique lines/file
Algorithm | Collisions
-------------+-----------
prior_hash 418
djb 5
djb + cleanup 6
test_linux26: 30198 files; 258 avg. unique lines/file
Algorithm | Collisions
-------------+-----------
prior_hash 8675
djb 32
djb + cleanup 7
test_frameworks_base: 8381 files; 184 avg. unique lines/file
Algorithm | Collisions
-------------+-----------
prior_hash 4615
djb 10
djb + cleanup 7
This is a massive improvement, as the number of collisions for
common inputs drops to acceptable levels, and we haven't really
made the hash functions any more complex than they were before.
[1] http://lkml.org/lkml/2009/10/27/404
Change-Id: Ia753b695de9526a157ddba265824240bd05dead1
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
Diffstat (limited to 'org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java')
-rw-r--r-- | org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java | 44 |
1 files changed, 19 insertions, 25 deletions
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java index 39bcebb4c3..6627268e48 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java @@ -68,20 +68,13 @@ class SimilarityIndex { /** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */ private static final int MAX_HASH_BITS = 17; - /** The {@link #idHash} table will not grow bigger than this, ever. */ - private static final int MAX_HASH_SIZE = 1 << MAX_HASH_BITS; - - /** Prime just before {@link #MAX_HASH_SIZE}. */ - private static final int P = 131071; - /** * Shift to apply before storing a key. * <p> * Within the 64 bit table record space, we leave the highest bit unset so - * all values are positive, and we need {@link #MAX_HASH_BITS} bits for the - * keys. The lower 32 bits are used to count bytes impacted. + * all values are positive. The lower 32 bits to count bytes. */ - private static final int KEY_SHIFT = 64 - 1 - MAX_HASH_BITS; + private static final int KEY_SHIFT = 32; /** Total size of the file we hashed into the structure. */ private long fileSize; @@ -100,8 +93,12 @@ class SimilarityIndex { */ private long[] idHash; + /** {@code idHash.length == 1 << idHashBits}. */ + private int idHashBits; + SimilarityIndex() { - idHash = new long[256]; + idHashBits = 8; + idHash = new long[1 << idHashBits]; } long getFileSize() { @@ -138,7 +135,7 @@ class SimilarityIndex { int c = raw[ptr++] & 0xff; if (c == '\n') break; - hash = (hash << 5) ^ c; + hash = (hash << 5) + hash + c; } while (ptr < end && ptr - start < 64); add(hash, ptr - start); } @@ -166,7 +163,7 @@ class SimilarityIndex { int c = buf[ptr++] & 0xff; if (c == '\n') break; - hash = (hash << 5) ^ c; + hash = (hash << 5) + hash + c; } while (n < 64 && n < remaining); add(hash, n); remaining -= n; @@ -272,7 +269,8 @@ class SimilarityIndex { } void add(int key, int cnt) { - key = hash(key); + key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative. + int j = slot(key); for (;;) { long v = idHash[j]; @@ -298,28 +296,24 @@ class SimilarityIndex { } } - private static int hash(int key) { - // Make the key fit into our table. Since we have a maximum size - // that we cap the table at, all keys get squashed before going - // into the table. This prevents overflow. - // - return (key >>> 1) % P; - } - private int slot(int key) { - return key % idHash.length; + // We use 31 - idHashBits because the upper bit was already forced + // to be 0 and we want the remaining high bits to be used as the + // table slot. + // + return key >>> (31 - idHashBits); } private boolean shouldGrow() { - int n = idHash.length; - return n < MAX_HASH_SIZE && n <= idSize * 2; + return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2; } private void grow() { long[] oldHash = idHash; int oldSize = idHash.length; - idHash = new long[2 * oldSize]; + idHashBits++; + idHash = new long[1 << idHashBits]; for (int i = 0; i < oldSize; i++) { long v = oldHash[i]; if (v != 0) { |