]> source.dussan.org Git - jgit.git/commitdiff
SimilarityIndex: Accept files larger than 8 MB 89/1889/2
authorShawn O. Pearce <spearce@spearce.org>
Thu, 11 Nov 2010 22:10:32 +0000 (14:10 -0800)
committerShawn O. Pearce <spearce@spearce.org>
Fri, 12 Nov 2010 19:56:59 +0000 (11:56 -0800)
Files bigger than 8 MB (2^23 bytes) tended to overflow the internal
hashtable, as the table was capped in size to 2^17 records.  If a
file contained 2^17 unique data blocks/lines, the table insertion
got stuck in an infinite loop as the able couldn't grow, and there
was no open slot for the new item.

Remove the artifical 2^17 table limit and instead allow the table
to grow to be as big as 2^30.  With a 64 byte block size, this
permits hashing inputs as large as 64 GB.

If the table reaches 2^30 (or cannot be allocated) hashing is
aborted.  RenameDetector no longer tries to break a modify file pair,
and it does not try to match the file for rename or copy detection.

Change-Id: Ibb4d756844f4667e181e24a34a468dc3655863ac
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java

index 7e42e53586a22bbe94618097c8b72e8a072d7978..1da5828b34fae2ccdb217b824bdfb53c64023e8d 100644 (file)
@@ -48,10 +48,11 @@ import java.io.IOException;
 
 import junit.framework.TestCase;
 
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.Constants;
 
 public class SimilarityIndexTest extends TestCase {
-       public void testIndexingSmallObject() {
+       public void testIndexingSmallObject() throws TableFullException {
                SimilarityIndex si = hash("" //
                                + "A\n" //
                                + "B\n" //
@@ -70,7 +71,8 @@ public class SimilarityIndexTest extends TestCase {
                assertEquals(2, si.count(si.findIndex(key_D)));
        }
 
-       public void testIndexingLargeObject() throws IOException {
+       public void testIndexingLargeObject() throws IOException,
+                       TableFullException {
                byte[] in = ("" //
                                + "A\n" //
                                + "B\n" //
@@ -81,7 +83,7 @@ public class SimilarityIndexTest extends TestCase {
                assertEquals(2, si.size());
        }
 
-       public void testCommonScore_SameFiles() {
+       public void testCommonScore_SameFiles() throws TableFullException {
                String text = "" //
                                + "A\n" //
                                + "B\n" //
@@ -96,21 +98,22 @@ public class SimilarityIndexTest extends TestCase {
                assertEquals(100, dst.score(src, 100));
        }
 
-       public void testCommonScore_EmptyFiles() {
+       public void testCommonScore_EmptyFiles() throws TableFullException {
                SimilarityIndex src = hash("");
                SimilarityIndex dst = hash("");
                assertEquals(0, src.common(dst));
                assertEquals(0, dst.common(src));
        }
 
-       public void testCommonScore_TotallyDifferentFiles() {
+       public void testCommonScore_TotallyDifferentFiles()
+                       throws TableFullException {
                SimilarityIndex src = hash("A\n");
                SimilarityIndex dst = hash("D\n");
                assertEquals(0, src.common(dst));
                assertEquals(0, dst.common(src));
        }
 
-       public void testCommonScore_SimiliarBy75() {
+       public void testCommonScore_SimiliarBy75() throws TableFullException {
                SimilarityIndex src = hash("A\nB\nC\nD\n");
                SimilarityIndex dst = hash("A\nB\nC\nQ\n");
                assertEquals(6, src.common(dst));
@@ -120,10 +123,11 @@ public class SimilarityIndexTest extends TestCase {
                assertEquals(75, dst.score(src, 100));
        }
 
-       private static SimilarityIndex hash(String text) {
+       private static SimilarityIndex hash(String text) throws TableFullException {
                SimilarityIndex src = new SimilarityIndex() {
                        @Override
-                       void hash(byte[] raw, int ptr, final int end) {
+                       void hash(byte[] raw, int ptr, final int end)
+                                       throws TableFullException {
                                while (ptr < end) {
                                        int hash = raw[ptr] & 0xff;
                                        int start = ptr;
@@ -143,7 +147,7 @@ public class SimilarityIndexTest extends TestCase {
                return src;
        }
 
-       private static int keyFor(String line) {
+       private static int keyFor(String line) throws TableFullException {
                SimilarityIndex si = hash(line);
                assertEquals("single line scored", 1, si.size());
                return si.key(0);
index 66218f6405ab4a4036bc525f93282ad0f5011467..9d9a96d8dff15d408a31b9e68aa8876c0754befc 100644 (file)
@@ -57,6 +57,7 @@ import java.util.List;
 
 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.AbbreviatedObjectId;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
@@ -445,14 +446,23 @@ public class RenameDetector {
 
        private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
                        throws IOException {
-               SimilarityIndex src = new SimilarityIndex();
-               src.hash(reader.open(OLD, d));
-               src.sort();
-
-               SimilarityIndex dst = new SimilarityIndex();
-               dst.hash(reader.open(NEW, d));
-               dst.sort();
-               return src.score(dst, 100);
+               try {
+                       SimilarityIndex src = new SimilarityIndex();
+                       src.hash(reader.open(OLD, d));
+                       src.sort();
+
+                       SimilarityIndex dst = new SimilarityIndex();
+                       dst.hash(reader.open(NEW, d));
+                       dst.sort();
+                       return src.score(dst, 100);
+               } catch (TableFullException tableFull) {
+                       // If either table overflowed while being constructed, don't allow
+                       // the pair to be broken. Returning 1 higher than breakScore will
+                       // ensure its not similar, but not quite dissimilar enough to break.
+                       //
+                       overRenameLimit = true;
+                       return breakScore + 1;
+               }
        }
 
        private void findContentRenames(ContentSource.Pair reader,
@@ -468,6 +478,7 @@ public class RenameDetector {
                        d = new SimilarityRenameDetector(reader, deleted, added);
                        d.setRenameScore(getRenameScore());
                        d.compute(pm);
+                       overRenameLimit |= d.isTableOverflow();
                        deleted = d.getLeftOverSources();
                        added = d.getLeftOverDestinations();
                        entries.addAll(d.getMatches());
index 853132589589988d4f1780c8568d09f0b2847741..04530061355b6203872d567bd760050ef7425ded 100644 (file)
@@ -65,8 +65,8 @@ import org.eclipse.jgit.lib.ObjectStream;
  * file are discovered.
  */
 class SimilarityIndex {
-       /** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
-       private static final int MAX_HASH_BITS = 17;
+       /** A special {@link TableFullException} used in place of OutOfMemoryError. */
+       private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
 
        /**
         * Shift to apply before storing a key.
@@ -82,14 +82,17 @@ class SimilarityIndex {
        /** Number of non-zero entries in {@link #idHash}. */
        private int idSize;
 
+       /** {@link #idSize} that triggers {@link #idHash} to double in size. */
+       private int idGrowAt;
+
        /**
         * Pairings of content keys and counters.
         * <p>
         * Slots in the table are actually two ints wedged into a single long. The
-        * upper {@link #MAX_HASH_BITS} bits stores the content key, and the
-        * remaining lower bits stores the number of bytes associated with that key.
-        * Empty slots are denoted by 0, which cannot occur because the count cannot
-        * be 0. Values can only be positive, which we enforce during key addition.
+        * upper 32 bits stores the content key, and the remaining lower bits stores
+        * the number of bytes associated with that key. Empty slots are denoted by
+        * 0, which cannot occur because the count cannot be 0. Values can only be
+        * positive, which we enforce during key addition.
         */
        private long[] idHash;
 
@@ -99,6 +102,7 @@ class SimilarityIndex {
        SimilarityIndex() {
                idHashBits = 8;
                idHash = new long[1 << idHashBits];
+               idGrowAt = growAt(idHashBits);
        }
 
        long getFileSize() {
@@ -109,7 +113,8 @@ class SimilarityIndex {
                fileSize = size;
        }
 
-       void hash(ObjectLoader obj) throws MissingObjectException, IOException {
+       void hash(ObjectLoader obj) throws MissingObjectException, IOException,
+                       TableFullException {
                if (obj.isLarge()) {
                        ObjectStream in = obj.openStream();
                        try {
@@ -125,7 +130,7 @@ class SimilarityIndex {
                }
        }
 
-       void hash(byte[] raw, int ptr, final int end) {
+       void hash(byte[] raw, int ptr, final int end) throws TableFullException {
                while (ptr < end) {
                        int hash = 5381;
                        int start = ptr;
@@ -141,7 +146,8 @@ class SimilarityIndex {
                }
        }
 
-       void hash(InputStream in, long remaining) throws IOException {
+       void hash(InputStream in, long remaining) throws IOException,
+                       TableFullException {
                byte[] buf = new byte[4096];
                int ptr = 0;
                int cnt = 0;
@@ -268,7 +274,7 @@ class SimilarityIndex {
                return (idHash.length - idSize) + idx;
        }
 
-       void add(int key, int cnt) {
+       void add(int key, int cnt) throws TableFullException {
                key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
 
                int j = slot(key);
@@ -276,7 +282,7 @@ class SimilarityIndex {
                        long v = idHash[j];
                        if (v == 0) {
                                // Empty slot in the table, store here.
-                               if (shouldGrow()) {
+                               if (idGrowAt <= idSize) {
                                        grow();
                                        j = slot(key);
                                        continue;
@@ -304,16 +310,26 @@ class SimilarityIndex {
                return key >>> (31 - idHashBits);
        }
 
-       private boolean shouldGrow() {
-               return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2;
+       private static int growAt(int idHashBits) {
+               return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
        }
 
-       private void grow() {
+       private void grow() throws TableFullException {
+               if (idHashBits == 30)
+                       throw new TableFullException();
+
                long[] oldHash = idHash;
                int oldSize = idHash.length;
 
                idHashBits++;
-               idHash = new long[1 << idHashBits];
+               idGrowAt = growAt(idHashBits);
+
+               try {
+                       idHash = new long[1 << idHashBits];
+               } catch (OutOfMemoryError noMemory) {
+                       throw TABLE_FULL_OUT_OF_MEMORY;
+               }
+
                for (int i = 0; i < oldSize; i++) {
                        long v = oldHash[i];
                        if (v != 0) {
@@ -333,4 +349,8 @@ class SimilarityIndex {
        private static int countOf(long v) {
                return (int) v;
        }
+
+       static class TableFullException extends Exception {
+               private static final long serialVersionUID = 1L;
+       }
 }
index 3075c223a6642529e8f4cdb02b3cce8b4e6d206e..89e71e666121b1cebfa8d1ac7f8db42a7055e62c 100644 (file)
@@ -53,6 +53,7 @@ import java.util.List;
 
 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
 import org.eclipse.jgit.lib.ProgressMonitor;
@@ -110,6 +111,9 @@ class SimilarityRenameDetector {
        /** Score a pair must exceed to be considered a rename. */
        private int renameScore = 60;
 
+       /** Set if any {@link SimilarityIndex.TableFullException} occurs. */
+       private boolean tableOverflow;
+
        private List<DiffEntry> out;
 
        SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
@@ -182,6 +186,10 @@ class SimilarityRenameDetector {
                return dsts;
        }
 
+       boolean isTableOverflow() {
+               return tableOverflow;
+       }
+
        private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
                ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
                for (DiffEntry e : in) {
@@ -226,7 +234,14 @@ class SimilarityRenameDetector {
                                continue;
                        }
 
-                       SimilarityIndex s = hash(OLD, srcEnt);
+                       SimilarityIndex s;
+                       try {
+                               s = hash(OLD, srcEnt);
+                       } catch (TableFullException tableFull) {
+                               tableOverflow = true;
+                               continue;
+                       }
+
                        for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
                                DiffEntry dstEnt = dsts.get(dstIdx);
 
@@ -260,7 +275,15 @@ class SimilarityRenameDetector {
                                        continue;
                                }
 
-                               SimilarityIndex d = hash(NEW, dstEnt);
+                               SimilarityIndex d;
+                               try {
+                                       d = hash(NEW, dstEnt);
+                               } catch (TableFullException tableFull) {
+                                       tableOverflow = true;
+                                       pm.update(1);
+                                       continue;
+                               }
+
                                int contentScore = s.score(d, 10000);
 
                                // nameScore returns a value between 0 and 100, but we want it
@@ -336,7 +359,7 @@ class SimilarityRenameDetector {
        }
 
        private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
-                       throws IOException {
+                       throws IOException, TableFullException {
                SimilarityIndex r = new SimilarityIndex();
                r.hash(reader.open(side, ent));
                r.sort();