SimilarityIndex: Accept files larger than 8 MB

Files bigger than 8 MB (2^23 bytes) tended to overflow the internal hashtable, as the table was capped in size to 2^17 records. If a file contained 2^17 unique data blocks/lines, the table insertion got stuck in an infinite loop as the able couldn't grow, and there was no open slot for the new item. Remove the artifical 2^17 table limit and instead allow the table to grow to be as big as 2^30. With a 64 byte block size, this permits hashing inputs as large as 64 GB. If the table reaches 2^30 (or cannot be allocated) hashing is aborted. RenameDetector no longer tries to break a modify file pair, and it does not try to match the file for rename or copy detection. Change-Id: Ibb4d756844f4667e181e24a34a468dc3655863ac Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
author: Shawn O. Pearce <spearce@spearce.org> 2010-11-11 14:10:32 -0800
committer: Shawn O. Pearce <spearce@spearce.org> 2010-11-12 11:56:59 -0800
commit: d63887127e20c0a70c53c48a9aa5ffbdb1cf8873 (patch)
tree: d684d159514531211193a5e84963ef21b45db001 /org.eclipse.jgit
parent: f3b511568b5f6af219bd9c948092bbec46098484 (diff)
download: jgit-d63887127e20c0a70c53c48a9aa5ffbdb1cf8873.tar.gz
jgit-d63887127e20c0a70c53c48a9aa5ffbdb1cf8873.zip
3 files changed, 80 insertions, 26 deletions
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
index 66218f6405..9d9a96d8df 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
@@ -57,6 +57,7 @@ import java.util.List;
 
 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.AbbreviatedObjectId;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
@@ -445,14 +446,23 @@ public class RenameDetector {
 
 	private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
 			throws IOException {
-		SimilarityIndex src = new SimilarityIndex();
-		src.hash(reader.open(OLD, d));
-		src.sort();
-
-		SimilarityIndex dst = new SimilarityIndex();
-		dst.hash(reader.open(NEW, d));
-		dst.sort();
-		return src.score(dst, 100);
+		try {
+			SimilarityIndex src = new SimilarityIndex();
+			src.hash(reader.open(OLD, d));
+			src.sort();
+
+			SimilarityIndex dst = new SimilarityIndex();
+			dst.hash(reader.open(NEW, d));
+			dst.sort();
+			return src.score(dst, 100);
+		} catch (TableFullException tableFull) {
+			// If either table overflowed while being constructed, don't allow
+			// the pair to be broken. Returning 1 higher than breakScore will
+			// ensure its not similar, but not quite dissimilar enough to break.
+			//
+			overRenameLimit = true;
+			return breakScore + 1;
+		}
 	}
 
 	private void findContentRenames(ContentSource.Pair reader,
@@ -468,6 +478,7 @@ public class RenameDetector {
 			d = new SimilarityRenameDetector(reader, deleted, added);
 			d.setRenameScore(getRenameScore());
 			d.compute(pm);
+			overRenameLimit |= d.isTableOverflow();
 			deleted = d.getLeftOverSources();
 			added = d.getLeftOverDestinations();
 			entries.addAll(d.getMatches());
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
index 8531325895..0453006135 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@@ -65,8 +65,8 @@ import org.eclipse.jgit.lib.ObjectStream;
  * file are discovered.
  */
 class SimilarityIndex {
-	/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
-	private static final int MAX_HASH_BITS = 17;
+	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
+	private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
 
 	/**
 	 * Shift to apply before storing a key.
@@ -82,14 +82,17 @@ class SimilarityIndex {
 	/** Number of non-zero entries in {@link #idHash}. */
 	private int idSize;
 
+	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
+	private int idGrowAt;
+
 	/**
 	 * Pairings of content keys and counters.
 	 * <p>
 	 * Slots in the table are actually two ints wedged into a single long. The
-	 * upper {@link #MAX_HASH_BITS} bits stores the content key, and the
-	 * remaining lower bits stores the number of bytes associated with that key.
-	 * Empty slots are denoted by 0, which cannot occur because the count cannot
-	 * be 0. Values can only be positive, which we enforce during key addition.
+	 * upper 32 bits stores the content key, and the remaining lower bits stores
+	 * the number of bytes associated with that key. Empty slots are denoted by
+	 * 0, which cannot occur because the count cannot be 0. Values can only be
+	 * positive, which we enforce during key addition.
 	 */
 	private long[] idHash;
 
@@ -99,6 +102,7 @@ class SimilarityIndex {
 	SimilarityIndex() {
 		idHashBits = 8;
 		idHash = new long[1 << idHashBits];
+		idGrowAt = growAt(idHashBits);
 	}
 
 	long getFileSize() {
@@ -109,7 +113,8 @@ class SimilarityIndex {
 		fileSize = size;
 	}
 
-	void hash(ObjectLoader obj) throws MissingObjectException, IOException {
+	void hash(ObjectLoader obj) throws MissingObjectException, IOException,
+			TableFullException {
 		if (obj.isLarge()) {
 			ObjectStream in = obj.openStream();
 			try {
@@ -125,7 +130,7 @@ class SimilarityIndex {
 		}
 	}
 
-	void hash(byte[] raw, int ptr, final int end) {
+	void hash(byte[] raw, int ptr, final int end) throws TableFullException {
 		while (ptr < end) {
 			int hash = 5381;
 			int start = ptr;
@@ -141,7 +146,8 @@ class SimilarityIndex {
 		}
 	}
 
-	void hash(InputStream in, long remaining) throws IOException {
+	void hash(InputStream in, long remaining) throws IOException,
+			TableFullException {
 		byte[] buf = new byte[4096];
 		int ptr = 0;
 		int cnt = 0;
@@ -268,7 +274,7 @@ class SimilarityIndex {
 		return (idHash.length - idSize) + idx;
 	}
 
-	void add(int key, int cnt) {
+	void add(int key, int cnt) throws TableFullException {
 		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
 
 		int j = slot(key);
@@ -276,7 +282,7 @@ class SimilarityIndex {
 			long v = idHash[j];
 			if (v == 0) {
 				// Empty slot in the table, store here.
-				if (shouldGrow()) {
+				if (idGrowAt <= idSize) {
 					grow();
 					j = slot(key);
 					continue;
@@ -304,16 +310,26 @@ class SimilarityIndex {
 		return key >>> (31 - idHashBits);
 	}
 
-	private boolean shouldGrow() {
-		return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2;
+	private static int growAt(int idHashBits) {
+		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
 	}
 
-	private void grow() {
+	private void grow() throws TableFullException {
+		if (idHashBits == 30)
+			throw new TableFullException();
+
 		long[] oldHash = idHash;
 		int oldSize = idHash.length;
 
 		idHashBits++;
-		idHash = new long[1 << idHashBits];
+		idGrowAt = growAt(idHashBits);
+
+		try {
+			idHash = new long[1 << idHashBits];
+		} catch (OutOfMemoryError noMemory) {
+			throw TABLE_FULL_OUT_OF_MEMORY;
+		}
+
 		for (int i = 0; i < oldSize; i++) {
 			long v = oldHash[i];
 			if (v != 0) {
@@ -333,4 +349,8 @@ class SimilarityIndex {
 	private static int countOf(long v) {
 		return (int) v;
 	}
+
+	static class TableFullException extends Exception {
+		private static final long serialVersionUID = 1L;
+	}
 }
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
index 3075c223a6..89e71e6661 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
@@ -53,6 +53,7 @@ import java.util.List;
 
 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
 import org.eclipse.jgit.lib.ProgressMonitor;
@@ -110,6 +111,9 @@ class SimilarityRenameDetector {
 	/** Score a pair must exceed to be considered a rename. */
 	private int renameScore = 60;
 
+	/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
+	private boolean tableOverflow;
+
 	private List<DiffEntry> out;
 
 	SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
@@ -182,6 +186,10 @@ class SimilarityRenameDetector {
 		return dsts;
 	}
 
+	boolean isTableOverflow() {
+		return tableOverflow;
+	}
+
 	private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
 		ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
 		for (DiffEntry e : in) {
@@ -226,7 +234,14 @@ class SimilarityRenameDetector {
 				continue;
 			}
 
-			SimilarityIndex s = hash(OLD, srcEnt);
+			SimilarityIndex s;
+			try {
+				s = hash(OLD, srcEnt);
+			} catch (TableFullException tableFull) {
+				tableOverflow = true;
+				continue;
+			}
+
 			for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
 				DiffEntry dstEnt = dsts.get(dstIdx);
 
@@ -260,7 +275,15 @@ class SimilarityRenameDetector {
 					continue;
 				}
 
-				SimilarityIndex d = hash(NEW, dstEnt);
+				SimilarityIndex d;
+				try {
+					d = hash(NEW, dstEnt);
+				} catch (TableFullException tableFull) {
+					tableOverflow = true;
+					pm.update(1);
+					continue;
+				}
+
 				int contentScore = s.score(d, 10000);
 
 				// nameScore returns a value between 0 and 100, but we want it
@@ -336,7 +359,7 @@ class SimilarityRenameDetector {
 	}
 
 	private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
-			throws IOException {
+			throws IOException, TableFullException {
 		SimilarityIndex r = new SimilarityIndex();
 		r.hash(reader.open(side, ent));
 		r.sort();
author	Shawn O. Pearce <spearce@spearce.org>	2010-11-11 14:10:32 -0800
committer	Shawn O. Pearce <spearce@spearce.org>	2010-11-12 11:56:59 -0800
commit	d63887127e20c0a70c53c48a9aa5ffbdb1cf8873 (patch)
tree	d684d159514531211193a5e84963ef21b45db001 /org.eclipse.jgit
parent	f3b511568b5f6af219bd9c948092bbec46098484 (diff)
download	jgit-d63887127e20c0a70c53c48a9aa5ffbdb1cf8873.tar.gz jgit-d63887127e20c0a70c53c48a9aa5ffbdb1cf8873.zip