Added a check in Diff to ensure that files that are most likely not text are not diffed line by line. A file is considered binary if a null character appears within its first 8000 bytes. This is similar to the heuristic that C Git uses. Change-Id: I2b6f05674c88d89b3f549a5db483f850f7f46c26 (tags/v0.9.1)
+ (mode1.equals(mode2) ? " " + mode1 : "")); | + (mode1.equals(mode2) ? " " + mode1 : "")); | ||||
out.println("--- " + (isNew ? "/dev/null" : name1)); | out.println("--- " + (isNew ? "/dev/null" : name1)); | ||||
out.println("+++ " + (isDelete ? "/dev/null" : name2)); | out.println("+++ " + (isDelete ? "/dev/null" : name2)); | ||||
RawText a = getRawText(id1); | |||||
RawText b = getRawText(id2); | |||||
byte[] aRaw = getRawBytes(id1); | |||||
byte[] bRaw = getRawBytes(id2); | |||||
if (RawText.isBinary(aRaw) || RawText.isBinary(bRaw)) { | |||||
out.println("Binary files differ"); | |||||
return; | |||||
} | |||||
RawText a = getRawText(aRaw); | |||||
RawText b = getRawText(bRaw); | |||||
MyersDiff diff = new MyersDiff(a, b); | MyersDiff diff = new MyersDiff(a, b); | ||||
fmt.formatEdits(out, a, b, diff.getEdits()); | fmt.formatEdits(out, a, b, diff.getEdits()); | ||||
} | } | ||||
private RawText getRawText(ObjectId id) throws IOException { | |||||
private byte[] getRawBytes(ObjectId id) throws IOException { | |||||
if (id.equals(ObjectId.zeroId())) | if (id.equals(ObjectId.zeroId())) | ||||
return new RawText(new byte[] {}); | |||||
byte[] raw = db.openBlob(id).getCachedBytes(); | |||||
return new byte[] {}; | |||||
return db.openBlob(id).getCachedBytes(); | |||||
} | |||||
private RawText getRawText(byte[] raw) { | |||||
if (ignoreWsAll) | if (ignoreWsAll) | ||||
return new RawTextIgnoreAllWhitespace(raw); | return new RawTextIgnoreAllWhitespace(raw); | ||||
else if (ignoreWsTrailing) | else if (ignoreWsTrailing) | ||||
return new RawText(raw); | return new RawText(raw); | ||||
} | } | ||||
} | } | ||||
* they are converting from "line number" to "element index". | * they are converting from "line number" to "element index". | ||||
*/ | */ | ||||
public class RawText implements Sequence { | public class RawText implements Sequence { | ||||
/** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */ | |||||
private static final int FIRST_FEW_BYTES = 8000; | |||||
/** The file content for this sequence. */ | /** The file content for this sequence. */ | ||||
protected final byte[] content; | protected final byte[] content; | ||||
hash = (hash << 5) ^ (raw[ptr] & 0xff); | hash = (hash << 5) ^ (raw[ptr] & 0xff); | ||||
return hash; | return hash; | ||||
} | } | ||||
/** | |||||
* Determine heuristically whether a byte array represents binary (as | |||||
* opposed to text) content. | |||||
* | |||||
* @param raw | |||||
* the raw file content. | |||||
* @return true if raw is likely to be a binary file, false otherwise | |||||
*/ | |||||
public static boolean isBinary(byte[] raw) { | |||||
// Same heuristic as C Git | |||||
int size = raw.length > FIRST_FEW_BYTES ? FIRST_FEW_BYTES : raw.length; | |||||
for (int ptr = 0; ptr < size; ptr++) | |||||
if (raw[ptr] == '\0') | |||||
return true; | |||||
return false; | |||||
} | |||||
} | } |