diff options
author | Shawn O. Pearce <spearce@spearce.org> | 2010-11-11 14:10:32 -0800 |
---|---|---|
committer | Shawn O. Pearce <spearce@spearce.org> | 2010-11-12 11:56:59 -0800 |
commit | d63887127e20c0a70c53c48a9aa5ffbdb1cf8873 (patch) | |
tree | d684d159514531211193a5e84963ef21b45db001 /org.eclipse.jgit | |
parent | f3b511568b5f6af219bd9c948092bbec46098484 (diff) | |
download | jgit-d63887127e20c0a70c53c48a9aa5ffbdb1cf8873.tar.gz jgit-d63887127e20c0a70c53c48a9aa5ffbdb1cf8873.zip |
SimilarityIndex: Accept files larger than 8 MB
Files bigger than 8 MB (2^23 bytes) tended to overflow the internal
hashtable, as the table was capped in size to 2^17 records. If a
file contained 2^17 unique data blocks/lines, the table insertion
got stuck in an infinite loop as the able couldn't grow, and there
was no open slot for the new item.
Remove the artifical 2^17 table limit and instead allow the table
to grow to be as big as 2^30. With a 64 byte block size, this
permits hashing inputs as large as 64 GB.
If the table reaches 2^30 (or cannot be allocated) hashing is
aborted. RenameDetector no longer tries to break a modify file pair,
and it does not try to match the file for rename or copy detection.
Change-Id: Ibb4d756844f4667e181e24a34a468dc3655863ac
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
Diffstat (limited to 'org.eclipse.jgit')
3 files changed, 80 insertions, 26 deletions
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java index 66218f6405..9d9a96d8df 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java @@ -57,6 +57,7 @@ import java.util.List; import org.eclipse.jgit.JGitText; import org.eclipse.jgit.diff.DiffEntry.ChangeType; +import org.eclipse.jgit.diff.SimilarityIndex.TableFullException; import org.eclipse.jgit.lib.AbbreviatedObjectId; import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.NullProgressMonitor; @@ -445,14 +446,23 @@ public class RenameDetector { private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d) throws IOException { - SimilarityIndex src = new SimilarityIndex(); - src.hash(reader.open(OLD, d)); - src.sort(); - - SimilarityIndex dst = new SimilarityIndex(); - dst.hash(reader.open(NEW, d)); - dst.sort(); - return src.score(dst, 100); + try { + SimilarityIndex src = new SimilarityIndex(); + src.hash(reader.open(OLD, d)); + src.sort(); + + SimilarityIndex dst = new SimilarityIndex(); + dst.hash(reader.open(NEW, d)); + dst.sort(); + return src.score(dst, 100); + } catch (TableFullException tableFull) { + // If either table overflowed while being constructed, don't allow + // the pair to be broken. Returning 1 higher than breakScore will + // ensure its not similar, but not quite dissimilar enough to break. + // + overRenameLimit = true; + return breakScore + 1; + } } private void findContentRenames(ContentSource.Pair reader, @@ -468,6 +478,7 @@ public class RenameDetector { d = new SimilarityRenameDetector(reader, deleted, added); d.setRenameScore(getRenameScore()); d.compute(pm); + overRenameLimit |= d.isTableOverflow(); deleted = d.getLeftOverSources(); added = d.getLeftOverDestinations(); entries.addAll(d.getMatches()); diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java index 8531325895..0453006135 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java @@ -65,8 +65,8 @@ import org.eclipse.jgit.lib.ObjectStream; * file are discovered. */ class SimilarityIndex { - /** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */ - private static final int MAX_HASH_BITS = 17; + /** A special {@link TableFullException} used in place of OutOfMemoryError. */ + private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException(); /** * Shift to apply before storing a key. @@ -82,14 +82,17 @@ class SimilarityIndex { /** Number of non-zero entries in {@link #idHash}. */ private int idSize; + /** {@link #idSize} that triggers {@link #idHash} to double in size. */ + private int idGrowAt; + /** * Pairings of content keys and counters. * <p> * Slots in the table are actually two ints wedged into a single long. The - * upper {@link #MAX_HASH_BITS} bits stores the content key, and the - * remaining lower bits stores the number of bytes associated with that key. - * Empty slots are denoted by 0, which cannot occur because the count cannot - * be 0. Values can only be positive, which we enforce during key addition. + * upper 32 bits stores the content key, and the remaining lower bits stores + * the number of bytes associated with that key. Empty slots are denoted by + * 0, which cannot occur because the count cannot be 0. Values can only be + * positive, which we enforce during key addition. */ private long[] idHash; @@ -99,6 +102,7 @@ class SimilarityIndex { SimilarityIndex() { idHashBits = 8; idHash = new long[1 << idHashBits]; + idGrowAt = growAt(idHashBits); } long getFileSize() { @@ -109,7 +113,8 @@ class SimilarityIndex { fileSize = size; } - void hash(ObjectLoader obj) throws MissingObjectException, IOException { + void hash(ObjectLoader obj) throws MissingObjectException, IOException, + TableFullException { if (obj.isLarge()) { ObjectStream in = obj.openStream(); try { @@ -125,7 +130,7 @@ class SimilarityIndex { } } - void hash(byte[] raw, int ptr, final int end) { + void hash(byte[] raw, int ptr, final int end) throws TableFullException { while (ptr < end) { int hash = 5381; int start = ptr; @@ -141,7 +146,8 @@ class SimilarityIndex { } } - void hash(InputStream in, long remaining) throws IOException { + void hash(InputStream in, long remaining) throws IOException, + TableFullException { byte[] buf = new byte[4096]; int ptr = 0; int cnt = 0; @@ -268,7 +274,7 @@ class SimilarityIndex { return (idHash.length - idSize) + idx; } - void add(int key, int cnt) { + void add(int key, int cnt) throws TableFullException { key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative. int j = slot(key); @@ -276,7 +282,7 @@ class SimilarityIndex { long v = idHash[j]; if (v == 0) { // Empty slot in the table, store here. - if (shouldGrow()) { + if (idGrowAt <= idSize) { grow(); j = slot(key); continue; @@ -304,16 +310,26 @@ class SimilarityIndex { return key >>> (31 - idHashBits); } - private boolean shouldGrow() { - return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2; + private static int growAt(int idHashBits) { + return (1 << idHashBits) * (idHashBits - 3) / idHashBits; } - private void grow() { + private void grow() throws TableFullException { + if (idHashBits == 30) + throw new TableFullException(); + long[] oldHash = idHash; int oldSize = idHash.length; idHashBits++; - idHash = new long[1 << idHashBits]; + idGrowAt = growAt(idHashBits); + + try { + idHash = new long[1 << idHashBits]; + } catch (OutOfMemoryError noMemory) { + throw TABLE_FULL_OUT_OF_MEMORY; + } + for (int i = 0; i < oldSize; i++) { long v = oldHash[i]; if (v != 0) { @@ -333,4 +349,8 @@ class SimilarityIndex { private static int countOf(long v) { return (int) v; } + + static class TableFullException extends Exception { + private static final long serialVersionUID = 1L; + } } diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java index 3075c223a6..89e71e6661 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java @@ -53,6 +53,7 @@ import java.util.List; import org.eclipse.jgit.JGitText; import org.eclipse.jgit.diff.DiffEntry.ChangeType; +import org.eclipse.jgit.diff.SimilarityIndex.TableFullException; import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.NullProgressMonitor; import org.eclipse.jgit.lib.ProgressMonitor; @@ -110,6 +111,9 @@ class SimilarityRenameDetector { /** Score a pair must exceed to be considered a rename. */ private int renameScore = 60; + /** Set if any {@link SimilarityIndex.TableFullException} occurs. */ + private boolean tableOverflow; + private List<DiffEntry> out; SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs, @@ -182,6 +186,10 @@ class SimilarityRenameDetector { return dsts; } + boolean isTableOverflow() { + return tableOverflow; + } + private static List<DiffEntry> compactSrcList(List<DiffEntry> in) { ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size()); for (DiffEntry e : in) { @@ -226,7 +234,14 @@ class SimilarityRenameDetector { continue; } - SimilarityIndex s = hash(OLD, srcEnt); + SimilarityIndex s; + try { + s = hash(OLD, srcEnt); + } catch (TableFullException tableFull) { + tableOverflow = true; + continue; + } + for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) { DiffEntry dstEnt = dsts.get(dstIdx); @@ -260,7 +275,15 @@ class SimilarityRenameDetector { continue; } - SimilarityIndex d = hash(NEW, dstEnt); + SimilarityIndex d; + try { + d = hash(NEW, dstEnt); + } catch (TableFullException tableFull) { + tableOverflow = true; + pm.update(1); + continue; + } + int contentScore = s.score(d, 10000); // nameScore returns a value between 0 and 100, but we want it @@ -336,7 +359,7 @@ class SimilarityRenameDetector { } private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent) - throws IOException { + throws IOException, TableFullException { SimilarityIndex r = new SimilarityIndex(); r.hash(reader.open(side, ent)); r.sort(); |