source.dussan.org Git - jgit.git/commitdiff
Rename detection should canonicalize line endings 12/38912/1
author Marc Strapetz <marc.strapetz@syntevo.com>
Thu, 30 Oct 2014 18:36:49 +0000 (19:36 +0100)
committer Matthias Sohn <matthias.sohn@sap.com>
Fri, 2 Jan 2015 22:23:22 +0000 (23:23 +0100)
Native Git canonicalizes line endings when detecting
renames; more specifically, it replaces CRLF with LF.
See: hash_chars in diffcore-delta.c

Bug: 449545
Change-Id: Iec2aab12ae9e67074cccb7fbd4d9defe176a0130
Signed-off-by: Marc Strapetz <marc.strapetz@syntevo.com>
Signed-off-by: Matthias Sohn <matthias.sohn@sap.com>
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java

index 95423609a99858da2737dbc971da234c2e1a9dda..4724677bb89007b90ee82ce97334cece3fac9480 100644 (file)
@@ -83,7 +83,7 @@ public class SimilarityIndexTest {
                                + "B\n" //
                                + "B\n").getBytes("UTF-8");
                SimilarityIndex si = new SimilarityIndex();
-               si.hash(new ByteArrayInputStream(in), in.length);
+               si.hash(new ByteArrayInputStream(in), in.length, false);
                assertEquals(2, si.size());
        }
 
@@ -103,6 +103,48 @@ public class SimilarityIndexTest {
                assertEquals(100, dst.score(src, 100));
        }
 
+       @Test
+       public void testCommonScore_SameFiles_CR_canonicalization()
+                       throws TableFullException {
+               String text = "" //
+                               + "A\r\n" //
+                               + "B\r\n" //
+                               + "D\r\n" //
+                               + "B\r\n";
+               SimilarityIndex src = hash(text);
+               SimilarityIndex dst = hash(text.replace("\r", ""));
+               assertEquals(8, src.common(dst));
+               assertEquals(8, dst.common(src));
+
+               assertEquals(100, src.score(dst, 100));
+               assertEquals(100, dst.score(src, 100));
+       }
+
+       @Test
+       public void testCommonScoreLargeObject_SameFiles_CR_canonicalization()
+                       throws TableFullException, IOException {
+               String text = "" //
+                               + "A\r\n" //
+                               + "B\r\n" //
+                               + "D\r\n" //
+                               + "B\r\n";
+               SimilarityIndex src = new SimilarityIndex();
+               byte[] bytes1 = text.getBytes("UTF-8");
+               src.hash(new ByteArrayInputStream(bytes1), bytes1.length, true);
+               src.sort();
+
+               SimilarityIndex dst = new SimilarityIndex();
+               byte[] bytes2 = text.replace("\r", "").getBytes("UTF-8");
+               dst.hash(new ByteArrayInputStream(bytes2), bytes2.length, true);
+               dst.sort();
+
+               assertEquals(8, src.common(dst));
+               assertEquals(8, dst.common(src));
+
+               assertEquals(100, src.score(dst, 100));
+               assertEquals(100, dst.score(src, 100));
+       }
+
        @Test
        public void testCommonScore_EmptyFiles() throws TableFullException {
                SimilarityIndex src = hash("");
@@ -132,24 +174,8 @@ public class SimilarityIndexTest {
        }
 
        private static SimilarityIndex hash(String text) throws TableFullException {
-               SimilarityIndex src = new SimilarityIndex() {
-                       @Override
-                       void hash(byte[] raw, int ptr, final int end)
-                                       throws TableFullException {
-                               while (ptr < end) {
-                                       int hash = raw[ptr] & 0xff;
-                                       int start = ptr;
-                                       do {
-                                               int c = raw[ptr++] & 0xff;
-                                               if (c == '\n')
-                                                       break;
-                                       } while (ptr < end && ptr - start < 64);
-                                       add(hash, ptr - start);
-                               }
-                       }
-               };
+               SimilarityIndex src = new SimilarityIndex();
                byte[] raw = Constants.encode(text);
-               src.setFileSize(raw.length);
                src.hash(raw, 0, raw.length);
                src.sort();
                return src;
index 17ccb9726fb6668d6cfca6377e793bb4bb4aea51..f376b8e36e415404ed4a1c045a1132208408ccc8 100644 (file)
@@ -79,8 +79,11 @@ class SimilarityIndex {
        /** Maximum value of the count field, also mask to extract the count. */
        private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
 
-       /** Total size of the file we hashed into the structure. */
-       private long fileSize;
+       /**
+        * Total amount of bytes hashed into the structure, including \n. This is
+        * usually the size of the file minus number of CRLF encounters.
+        */
+       private long hashedCnt;
 
        /** Number of non-zero entries in {@link #idHash}. */
        private int idSize;
@@ -108,48 +111,59 @@ class SimilarityIndex {
                idGrowAt = growAt(idHashBits);
        }
 
-       long getFileSize() {
-               return fileSize;
-       }
-
-       void setFileSize(long size) {
-               fileSize = size;
-       }
-
        void hash(ObjectLoader obj) throws MissingObjectException, IOException,
                        TableFullException {
                if (obj.isLarge()) {
-                       ObjectStream in = obj.openStream();
-                       try {
-                               setFileSize(in.getSize());
-                               hash(in, fileSize);
-                       } finally {
-                               in.close();
-                       }
+                       hashLargeObject(obj);
                } else {
                        byte[] raw = obj.getCachedBytes();
-                       setFileSize(raw.length);
                        hash(raw, 0, raw.length);
                }
        }
 
+       private void hashLargeObject(ObjectLoader obj) throws IOException,
+                       TableFullException {
+               ObjectStream in1 = obj.openStream();
+               boolean text;
+               try {
+                       text = !RawText.isBinary(in1);
+               } finally {
+                       in1.close();
+               }
+
+               ObjectStream in2 = obj.openStream();
+               try {
+                       hash(in2, in2.getSize(), text);
+               } finally {
+                       in2.close();
+               }
+       }
+
        void hash(byte[] raw, int ptr, final int end) throws TableFullException {
+               final boolean text = !RawText.isBinary(raw);
+               hashedCnt = 0;
                while (ptr < end) {
                        int hash = 5381;
+                       int blockHashedCnt = 0;
                        int start = ptr;
 
                        // Hash one line, or one block, whichever occurs first.
                        do {
                                int c = raw[ptr++] & 0xff;
+                               // Ignore CR in CRLF sequence if text
+                               if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
+                                       continue;
+                               blockHashedCnt++;
                                if (c == '\n')
                                        break;
                                hash = (hash << 5) + hash + c;
                        } while (ptr < end && ptr - start < 64);
-                       add(hash, ptr - start);
+                       hashedCnt += blockHashedCnt;
+                       add(hash, blockHashedCnt);
                }
        }
 
-       void hash(InputStream in, long remaining) throws IOException,
+       void hash(InputStream in, long remaining, boolean text) throws IOException,
                        TableFullException {
                byte[] buf = new byte[4096];
                int ptr = 0;
@@ -157,6 +171,7 @@ class SimilarityIndex {
 
                while (0 < remaining) {
                        int hash = 5381;
+                       int blockHashedCnt = 0;
 
                        // Hash one line, or one block, whichever occurs first.
                        int n = 0;
@@ -170,11 +185,16 @@ class SimilarityIndex {
 
                                n++;
                                int c = buf[ptr++] & 0xff;
+                               // Ignore CR in CRLF sequence if text
+                               if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
+                                       continue;
+                               blockHashedCnt++;
                                if (c == '\n')
                                        break;
                                hash = (hash << 5) + hash + c;
                        } while (n < 64 && n < remaining);
-                       add(hash, n);
+                       hashedCnt += blockHashedCnt;
+                       add(hash, blockHashedCnt);
                        remaining -= n;
                }
        }
@@ -193,7 +213,7 @@ class SimilarityIndex {
        }
 
        int score(SimilarityIndex dst, int maxScore) {
-               long max = Math.max(fileSize, dst.fileSize);
+               long max = Math.max(hashedCnt, dst.hashedCnt);
                if (max == 0)
                        return maxScore;
                return (int) ((common(dst) * maxScore) / max);