aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java 62
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java 64
2 files changed, 86 insertions, 40 deletions
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
index 95423609a9..4724677bb8 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
@@ -83,7 +83,7 @@ public class SimilarityIndexTest {
+ "B\n" //
+ "B\n").getBytes("UTF-8");
SimilarityIndex si = new SimilarityIndex();
- si.hash(new ByteArrayInputStream(in), in.length);
+ si.hash(new ByteArrayInputStream(in), in.length, false);
assertEquals(2, si.size());
}
@@ -104,6 +104,48 @@ public class SimilarityIndexTest {
}
@Test
+ public void testCommonScore_SameFiles_CR_canonicalization()
+ throws TableFullException {
+ String text = "" //
+ + "A\r\n" //
+ + "B\r\n" //
+ + "D\r\n" //
+ + "B\r\n";
+ SimilarityIndex src = hash(text);
+ SimilarityIndex dst = hash(text.replace("\r", ""));
+ assertEquals(8, src.common(dst));
+ assertEquals(8, dst.common(src));
+
+ assertEquals(100, src.score(dst, 100));
+ assertEquals(100, dst.score(src, 100));
+ }
+
+ @Test
+ public void testCommonScoreLargeObject_SameFiles_CR_canonicalization()
+ throws TableFullException, IOException {
+ String text = "" //
+ + "A\r\n" //
+ + "B\r\n" //
+ + "D\r\n" //
+ + "B\r\n";
+ SimilarityIndex src = new SimilarityIndex();
+ byte[] bytes1 = text.getBytes("UTF-8");
+ src.hash(new ByteArrayInputStream(bytes1), bytes1.length, true);
+ src.sort();
+
+ SimilarityIndex dst = new SimilarityIndex();
+ byte[] bytes2 = text.replace("\r", "").getBytes("UTF-8");
+ dst.hash(new ByteArrayInputStream(bytes2), bytes2.length, true);
+ dst.sort();
+
+ assertEquals(8, src.common(dst));
+ assertEquals(8, dst.common(src));
+
+ assertEquals(100, src.score(dst, 100));
+ assertEquals(100, dst.score(src, 100));
+ }
+
+ @Test
public void testCommonScore_EmptyFiles() throws TableFullException {
SimilarityIndex src = hash("");
SimilarityIndex dst = hash("");
@@ -132,24 +174,8 @@ public class SimilarityIndexTest {
}
private static SimilarityIndex hash(String text) throws TableFullException {
- SimilarityIndex src = new SimilarityIndex() {
- @Override
- void hash(byte[] raw, int ptr, final int end)
- throws TableFullException {
- while (ptr < end) {
- int hash = raw[ptr] & 0xff;
- int start = ptr;
- do {
- int c = raw[ptr++] & 0xff;
- if (c == '\n')
- break;
- } while (ptr < end && ptr - start < 64);
- add(hash, ptr - start);
- }
- }
- };
+ SimilarityIndex src = new SimilarityIndex();
byte[] raw = Constants.encode(text);
- src.setFileSize(raw.length);
src.hash(raw, 0, raw.length);
src.sort();
return src;
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
index 17ccb9726f..f376b8e36e 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@@ -79,8 +79,11 @@ class SimilarityIndex {
/** Maximum value of the count field, also mask to extract the count. */
private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
- /** Total size of the file we hashed into the structure. */
- private long fileSize;
+ /**
+ * Total number of bytes hashed into the structure, including \n. This is
+ * usually the size of the file minus the number of CRLF sequences found.
+ */
+ private long hashedCnt;
/** Number of non-zero entries in {@link #idHash}. */
private int idSize;
@@ -108,48 +111,59 @@ class SimilarityIndex {
idGrowAt = growAt(idHashBits);
}
- long getFileSize() {
- return fileSize;
- }
-
- void setFileSize(long size) {
- fileSize = size;
- }
-
void hash(ObjectLoader obj) throws MissingObjectException, IOException,
TableFullException {
if (obj.isLarge()) {
- ObjectStream in = obj.openStream();
- try {
- setFileSize(in.getSize());
- hash(in, fileSize);
- } finally {
- in.close();
- }
+ hashLargeObject(obj);
} else {
byte[] raw = obj.getCachedBytes();
- setFileSize(raw.length);
hash(raw, 0, raw.length);
}
}
+ private void hashLargeObject(ObjectLoader obj) throws IOException,
+ TableFullException {
+ ObjectStream in1 = obj.openStream();
+ boolean text;
+ try {
+ text = !RawText.isBinary(in1);
+ } finally {
+ in1.close();
+ }
+
+ ObjectStream in2 = obj.openStream();
+ try {
+ hash(in2, in2.getSize(), text);
+ } finally {
+ in2.close();
+ }
+ }
+
void hash(byte[] raw, int ptr, final int end) throws TableFullException {
+ final boolean text = !RawText.isBinary(raw);
+ hashedCnt = 0;
while (ptr < end) {
int hash = 5381;
+ int blockHashedCnt = 0;
int start = ptr;
// Hash one line, or one block, whichever occurs first.
do {
int c = raw[ptr++] & 0xff;
+ // Ignore CR in CRLF sequence if text
+ if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
+ continue;
+ blockHashedCnt++;
if (c == '\n')
break;
hash = (hash << 5) + hash + c;
} while (ptr < end && ptr - start < 64);
- add(hash, ptr - start);
+ hashedCnt += blockHashedCnt;
+ add(hash, blockHashedCnt);
}
}
- void hash(InputStream in, long remaining) throws IOException,
+ void hash(InputStream in, long remaining, boolean text) throws IOException,
TableFullException {
byte[] buf = new byte[4096];
int ptr = 0;
@@ -157,6 +171,7 @@ class SimilarityIndex {
while (0 < remaining) {
int hash = 5381;
+ int blockHashedCnt = 0;
// Hash one line, or one block, whichever occurs first.
int n = 0;
@@ -170,11 +185,16 @@ class SimilarityIndex {
n++;
int c = buf[ptr++] & 0xff;
+ // Ignore CR in CRLF sequence if text
+ if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
+ continue;
+ blockHashedCnt++;
if (c == '\n')
break;
hash = (hash << 5) + hash + c;
} while (n < 64 && n < remaining);
- add(hash, n);
+ hashedCnt += blockHashedCnt;
+ add(hash, blockHashedCnt);
remaining -= n;
}
}
@@ -193,7 +213,7 @@ class SimilarityIndex {
}
int score(SimilarityIndex dst, int maxScore) {
- long max = Math.max(fileSize, dst.fileSize);
+ long max = Math.max(hashedCnt, dst.hashedCnt);
if (max == 0)
return maxScore;
return (int) ((common(dst) * maxScore) / max);