source.dussan.org Git - jgit.git/commitdiff
Added check for binary files while diffing 01/1001/2
author: Jeff Schumacher <jeffschu@google.com>
Tue, 29 Jun 2010 23:04:08 +0000 (16:04 -0700)
committer: Shawn O. Pearce <spearce@spearce.org>
Wed, 30 Jun 2010 00:23:00 +0000 (17:23 -0700)
Added a check in Diff to ensure that files that are most likely
not text are not line-by-line diffed. Files are determined to be
binary by checking the first 8000 bytes for a null character. This
is a similar heuristic to what C Git uses.

Change-Id: I2b6f05674c88d89b3f549a5db483f850f7f46c26

org.eclipse.jgit.pgm/src/org/eclipse/jgit/pgm/Diff.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java

index 931c46d333eefbecbb9b365d4e3e9a9f81ba30e5..fc1e400ab0dd2704ae5ed0845724679cd5130458 100644 (file)
@@ -132,16 +132,28 @@ class Diff extends TextBuiltin {
                        + (mode1.equals(mode2) ? " " + mode1 : ""));
                out.println("--- " + (isNew ?  "/dev/null" : name1));
                out.println("+++ " + (isDelete ?  "/dev/null" : name2));
-               RawText a = getRawText(id1);
-               RawText b = getRawText(id2);
+
+               byte[] aRaw = getRawBytes(id1);
+               byte[] bRaw = getRawBytes(id2);
+
+               if (RawText.isBinary(aRaw) || RawText.isBinary(bRaw)) {
+                       out.println("Binary files differ");
+                       return;
+               }
+
+               RawText a = getRawText(aRaw);
+               RawText b = getRawText(bRaw);
                MyersDiff diff = new MyersDiff(a, b);
                fmt.formatEdits(out, a, b, diff.getEdits());
        }
 
-       private RawText getRawText(ObjectId id) throws IOException {
+       private byte[] getRawBytes(ObjectId id) throws IOException {
                if (id.equals(ObjectId.zeroId()))
-                       return new RawText(new byte[] {});
-               byte[] raw = db.openBlob(id).getCachedBytes();
+                       return new byte[] {};
+               return db.openBlob(id).getCachedBytes();
+       }
+
+       private RawText getRawText(byte[] raw) {
                if (ignoreWsAll)
                        return new RawTextIgnoreAllWhitespace(raw);
                else if (ignoreWsTrailing)
@@ -154,4 +166,3 @@ class Diff extends TextBuiltin {
                        return new RawText(raw);
        }
 }
-
index c785534fbbefc0f3b0747f4050373bdd6204471e..c01cb7ad8e9d633d0d494f5834224f2c9b64c3b4 100644 (file)
@@ -65,6 +65,9 @@ import org.eclipse.jgit.util.RawParseUtils;
  * they are converting from "line number" to "element index".
  */
 public class RawText implements Sequence {
+       /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
+       private static final int FIRST_FEW_BYTES = 8000;
+
        /** The file content for this sequence. */
        protected final byte[] content;
 
@@ -202,4 +205,22 @@ public class RawText implements Sequence {
                        hash = (hash << 5) ^ (raw[ptr] & 0xff);
                return hash;
        }
+
+       /**
+        * Determine heuristically whether a byte array represents binary (as
+        * opposed to text) content, by looking for a NUL byte near its start.
+        *
+        * @param raw
+        *            the raw file content; at most FIRST_FEW_BYTES are scanned.
+        * @return true if raw is likely to be a binary file, false otherwise
+        */
+       public static boolean isBinary(byte[] raw) {
+               // Same heuristic as C Git: a NUL byte in the leading bytes => binary
+               int size = raw.length > FIRST_FEW_BYTES ? FIRST_FEW_BYTES : raw.length;
+               for (int ptr = 0; ptr < size; ptr++)
+                       if (raw[ptr] == '\0')
+                               return true;
+
+               return false;
+       }
 }