Browse Source

Micro-optimize reduceCommonStartEnd for RawText

This is a faster, exact-match based form that tries to improve
performance for the common case of the header and trailer of
a text file not changing at all. After this fast path we fall back
to the slower path in the super class, which uses equals(), so that
the whitespace ignore modes still work.

Some simple performance testing showed a major improvement over the
older implementation for a common edit we see in JGit.  The test
compared blob 29a89bc and 372a978, which is the ObjectDirectory.java
file difference in commit 41dd9ed1c0.
The two text files are approximately 22 KiB in size.

  DEFAULT        old   203900 ns
  DEFAULT        new   100400 ns

This new version is 2x faster for the DEFAULT comparator, which does
not treat space specially.  This is because we can now examine a
larger swath of text with fewer instructions per byte compared.  The
older algorithm had to stop at each line break and recompute how to
examine the next line, while the new algorithm only stops when the
first difference is found.

  WS_IGNORE_ALL  old   298500 ns
  WS_IGNORE_ALL  new    63300 ns

It's 4.7x faster for the whitespace ignore comparator, as the common
header and footer do not have a whitespace difference.  Avoiding the
special case handling for whitespace on each byte considered saves a
lot of time.

Since most edits to source code (and other text-like files) appear in
the interior of the file, fast elimination of common header/footer
means faster diff throughput.  In the less common case of an actual
header or footer edit, the common header/footer elimination is stopped
rather quickly either way, so there is very little downside to the
optimization applied here.

Change-Id: I1d501b4c3ff80ed086b20bf12faf51ae62167db7
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
tags/v0.10.1
Shawn O. Pearce 13 years ago
parent
commit
e0970cd1b4

+ 48
- 0
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RawTextTest.java View File

@@ -46,6 +46,7 @@ package org.eclipse.jgit.diff;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import junit.framework.TestCase;

@@ -98,4 +99,51 @@ public class RawTextTest extends TestCase {
final byte[] r = o.toByteArray();
assertEquals("", RawParseUtils.decode(r));
}

/**
 * Exercises reduceCommonStartEnd on the DEFAULT comparator with small
 * one-character-per-line inputs, then once on WS_IGNORE_ALL with lines
 * that differ only in whitespace around a single changed line.
 *
 * @throws UnsupportedEncodingException
 *             if the "UTF-8" charset name is rejected by the runtime.
 */
public void testComparatorReduceCommonStartEnd()
		throws UnsupportedEncodingException {
	final RawTextComparator exact = RawTextComparator.DEFAULT;
	final RawTextComparator ignoreWs = RawTextComparator.WS_IGNORE_ALL;

	// Both sides empty: the edit comes back unchanged.
	assertEquals(new Edit(0, 0, 0, 0),
			exact.reduceCommonStartEnd(t(""), t(""), new Edit(0, 0, 0, 0)));

	// Single differing line: the edit comes back unchanged.
	assertEquals(new Edit(0, 1, 0, 1),
			exact.reduceCommonStartEnd(t("a"), t("b"), new Edit(0, 1, 0, 1)));

	// Single identical line: the edit collapses to an empty one.
	assertEquals(new Edit(1, 1, 1, 1),
			exact.reduceCommonStartEnd(t("a"), t("a"), new Edit(0, 1, 0, 1)));

	// Common prefix only.
	assertEquals(new Edit(2, 3, 2, 3),
			exact.reduceCommonStartEnd(t("axB"), t("axC"),
					new Edit(0, 3, 0, 3)));

	// Common suffix only.
	assertEquals(new Edit(0, 1, 0, 1),
			exact.reduceCommonStartEnd(t("Bxy"), t("Cxy"),
					new Edit(0, 3, 0, 3)));

	// Pure insertion at the front of B.
	assertEquals(new Edit(0, 0, 0, 1),
			exact.reduceCommonStartEnd(t("bc"), t("Abc"),
					new Edit(0, 2, 0, 3)));

	// Common prefix and suffix around one changed line.
	final Edit both = new Edit(0, 5, 0, 5);
	assertEquals(new Edit(2, 3, 2, 3),
			exact.reduceCommonStartEnd(t("abQxy"), t("abRxy"), both));

	// Whitespace-only differences around the changed line.
	final RawText left = new RawText("p\na b\nQ\nc d\n".getBytes("UTF-8"));
	final RawText right = new RawText(
			"p\na b \nR\n c d \n".getBytes("UTF-8"));
	final Edit spaced = new Edit(0, 4, 0, 4);
	assertEquals(new Edit(2, 3, 2, 3),
			ignoreWs.reduceCommonStartEnd(left, right, spaced));
}

/**
 * Build a RawText from a compact string: every character of
 * {@code text} is written followed by a {@code '\n'}, and the result is
 * encoded as UTF-8. Presumably this yields one line per character.
 *
 * @param text
 *            characters to expand, one per line.
 * @return a RawText wrapping the expanded bytes.
 */
private static RawText t(String text) {
	final StringBuilder expanded = new StringBuilder();
	for (final char ch : text.toCharArray())
		expanded.append(ch).append('\n');

	final byte[] raw;
	try {
		raw = expanded.toString().getBytes("UTF-8");
	} catch (UnsupportedEncodingException e) {
		throw new RuntimeException(e);
	}
	return new RawText(raw);
}
}

+ 61
- 0
org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java View File

@@ -48,6 +48,8 @@ import static org.eclipse.jgit.util.RawCharUtil.isWhitespace;
import static org.eclipse.jgit.util.RawCharUtil.trimLeadingWhitespace;
import static org.eclipse.jgit.util.RawCharUtil.trimTrailingWhitespace;

import org.eclipse.jgit.util.IntList;

/** Equivalence function for {@link RawText}. */
public abstract class RawTextComparator extends SequenceComparator<RawText> {
/** No special treatment. */
@@ -275,6 +277,65 @@ public abstract class RawTextComparator extends SequenceComparator<RawText> {
return seq.hashes[ptr + 1];
}

/**
 * Shrink an edit by skipping the bytes at its start and end that are
 * identical in both texts, then delegate to the super class.
 * <p>
 * This is a fast path: it compares the raw content byte for byte, so it
 * only stops at the first real difference rather than at every line
 * break. The super class implementation, which relies on equals(), is
 * still run afterwards so whitespace ignoring comparators can remove any
 * further lines they consider equal.
 *
 * @param a
 *            the first text.
 * @param b
 *            the second text.
 * @param e
 *            the edit to shrink. It is returned untouched when either
 *            side is empty; otherwise its begin/end fields are updated
 *            in place.
 * @return {@code e} itself when either side of the edit is empty,
 *         otherwise whatever the super class returns for the updated
 *         edit.
 */
@Override
public Edit reduceCommonStartEnd(RawText a, RawText b, Edit e) {
	// Nothing can be common if one side of the edit has no lines.
	if (e.beginA == e.endA || e.beginB == e.endB)
		return e;

	byte[] aRaw = a.content;
	byte[] bRaw = b.content;

	// lines.get(n + 1) is used here as the byte offset where line n
	// begins. Each pointer must come from its own text's line table:
	// bPtr indexes bRaw, so it has to be read from b.lines (reading it
	// from a.lines gives a wrong offset whenever the tables differ).
	int aPtr = a.lines.get(e.beginA + 1);
	int bPtr = b.lines.get(e.beginB + 1);

	int aEnd = a.lines.get(e.endA + 1);
	int bEnd = b.lines.get(e.endB + 1);

	// This can never happen, but the JIT doesn't know that. If we
	// define this assertion before the tight while loops below it
	// should be able to skip the array bound checks on access.
	//
	if (aPtr < 0 || bPtr < 0 || aEnd > aRaw.length || bEnd > bRaw.length)
		throw new ArrayIndexOutOfBoundsException();

	// Skip the common leading bytes.
	while (aPtr < aEnd && bPtr < bEnd && aRaw[aPtr] == bRaw[bPtr]) {
		aPtr++;
		bPtr++;
	}

	// Skip the common trailing bytes, never crossing the front pointers.
	while (aPtr < aEnd && bPtr < bEnd && aRaw[aEnd - 1] == bRaw[bEnd - 1]) {
		aEnd--;
		bEnd--;
	}

	// Translate the byte positions back into line indexes.
	e.beginA = findForwardLine(a.lines, e.beginA, aPtr);
	e.beginB = findForwardLine(b.lines, e.beginB, bPtr);

	e.endA = findReverseLine(a.lines, e.endA, aEnd);
	e.endB = findReverseLine(b.lines, e.endB, bEnd);

	return super.reduceCommonStartEnd(a, b, e);
}

/**
 * Advance a line index while the table entry two slots ahead of it is
 * at or before {@code ptr}.
 *
 * @param lines
 *            table of positions indexed by line; presumably entry
 *            {@code n + 1} is the byte offset where line {@code n}
 *            starts (verify against RawText).
 * @param idx
 *            line index to start from.
 * @param ptr
 *            byte position to compare table entries against.
 * @return the advanced index; never moved to {@code lines.size() - 2}
 *         or beyond by this method.
 */
private static int findForwardLine(IntList lines, int idx, int ptr) {
	final int limit = lines.size() - 2;
	int line = idx;
	for (; line < limit; line++) {
		if (ptr < lines.get(line + 2))
			break;
	}
	return line;
}

/**
 * Step a line index backwards while it is positive and the table entry
 * at that index is at or after {@code ptr}.
 *
 * @param lines
 *            table of positions indexed by line; presumably entry
 *            {@code n + 1} is the byte offset where line {@code n}
 *            starts (verify against RawText).
 * @param idx
 *            line index to start from.
 * @param ptr
 *            byte position to compare table entries against.
 * @return the reduced index, never below 0 unless {@code idx} already
 *         was.
 */
private static int findReverseLine(IntList lines, int idx, int ptr) {
	int line = idx;
	for (;;) {
		if (line <= 0)
			return line;
		if (lines.get(line) < ptr)
			return line;
		line--;
	}
}

/**
* Compute a hash code for a region.
*

Loading…
Cancel
Save