HistogramDiff outperforms PatienceDiff in every case where PatienceDiff needs to fall back to another algorithm. Consequently PatienceDiff is not worth keeping around, because we would always want a fallback enabled. Change-Id: I39b99cb1db4b3be74a764dd3d68cd4c9ecd91481 Signed-off-by: Shawn O. Pearce <spearce@spearce.org> tags/v0.10.1
import org.eclipse.jgit.diff.DiffAlgorithm; | import org.eclipse.jgit.diff.DiffAlgorithm; | ||||
import org.eclipse.jgit.diff.HistogramDiff; | import org.eclipse.jgit.diff.HistogramDiff; | ||||
import org.eclipse.jgit.diff.MyersDiff; | import org.eclipse.jgit.diff.MyersDiff; | ||||
import org.eclipse.jgit.diff.PatienceDiff; | |||||
import org.eclipse.jgit.diff.RawText; | import org.eclipse.jgit.diff.RawText; | ||||
import org.eclipse.jgit.diff.RawTextComparator; | import org.eclipse.jgit.diff.RawTextComparator; | ||||
import org.eclipse.jgit.errors.LargeObjectException; | import org.eclipse.jgit.errors.LargeObjectException; | ||||
} | } | ||||
}; | }; | ||||
final Algorithm patience = new Algorithm() { | |||||
DiffAlgorithm create() { | |||||
PatienceDiff d = new PatienceDiff(); | |||||
d.setFallbackAlgorithm(null); | |||||
return d; | |||||
} | |||||
}; | |||||
final Algorithm patience_myers = new Algorithm() { | |||||
DiffAlgorithm create() { | |||||
PatienceDiff d = new PatienceDiff(); | |||||
d.setFallbackAlgorithm(MyersDiff.INSTANCE); | |||||
return d; | |||||
} | |||||
}; | |||||
final Algorithm patience_histogram_myers = new Algorithm() { | |||||
DiffAlgorithm create() { | |||||
HistogramDiff d2 = new HistogramDiff(); | |||||
d2.setFallbackAlgorithm(MyersDiff.INSTANCE); | |||||
PatienceDiff d1 = new PatienceDiff(); | |||||
d1.setFallbackAlgorithm(d2); | |||||
return d1; | |||||
} | |||||
}; | |||||
// ----------------------------------------------------------------------- | // ----------------------------------------------------------------------- | ||||
// | // | ||||
// Implementation of the suite lives below this line. | // Implementation of the suite lives below this line. |
/* | |||||
* Copyright (C) 2010, Google Inc. | |||||
* and other copyright owners as documented in the project's IP log. | |||||
* | |||||
* This program and the accompanying materials are made available | |||||
* under the terms of the Eclipse Distribution License v1.0 which | |||||
* accompanies this distribution, is reproduced below, and is | |||||
* available at http://www.eclipse.org/org/documents/edl-v10.php | |||||
* | |||||
* All rights reserved. | |||||
* | |||||
* Redistribution and use in source and binary forms, with or | |||||
* without modification, are permitted provided that the following | |||||
* conditions are met: | |||||
* | |||||
* - Redistributions of source code must retain the above copyright | |||||
* notice, this list of conditions and the following disclaimer. | |||||
* | |||||
* - Redistributions in binary form must reproduce the above | |||||
* copyright notice, this list of conditions and the following | |||||
* disclaimer in the documentation and/or other materials provided | |||||
* with the distribution. | |||||
* | |||||
* - Neither the name of the Eclipse Foundation, Inc. nor the | |||||
* names of its contributors may be used to endorse or promote | |||||
* products derived from this software without specific prior | |||||
* written permission. | |||||
* | |||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND | |||||
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, | |||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | |||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*/ | |||||
package org.eclipse.jgit.diff; | |||||
import org.eclipse.jgit.diff.DiffPerformanceTest.CharArray; | |||||
import org.eclipse.jgit.diff.DiffPerformanceTest.CharCmp; | |||||
public class PatienceDiffTest extends AbstractDiffTestCase { | |||||
@Override | |||||
protected DiffAlgorithm algorithm() { | |||||
PatienceDiff pd = new PatienceDiff(); | |||||
pd.setFallbackAlgorithm(null); | |||||
return pd; | |||||
} | |||||
public void testEdit_NoUniqueMiddleSideA() { | |||||
EditList r = diff(t("aRRSSz"), t("aSSRRz")); | |||||
assertEquals(1, r.size()); | |||||
assertEquals(new Edit(1, 5, 1, 5), r.get(0)); | |||||
} | |||||
public void testEdit_NoUniqueMiddleSideB() { | |||||
EditList r = diff(t("aRSz"), t("aSSRRz")); | |||||
assertEquals(1, r.size()); | |||||
assertEquals(new Edit(1, 3, 1, 5), r.get(0)); | |||||
} | |||||
public void testPerformanceTestDeltaLength() { | |||||
String a = DiffTestDataGenerator.generateSequence(40000, 971, 3); | |||||
String b = DiffTestDataGenerator.generateSequence(40000, 1621, 5); | |||||
CharArray ac = new CharArray(a); | |||||
CharArray bc = new CharArray(b); | |||||
EditList r = algorithm().diff(new CharCmp(), ac, bc); | |||||
assertEquals(25, r.size()); | |||||
} | |||||
} |
/* | |||||
* Copyright (C) 2010, Google Inc. | |||||
* and other copyright owners as documented in the project's IP log. | |||||
* | |||||
* This program and the accompanying materials are made available | |||||
* under the terms of the Eclipse Distribution License v1.0 which | |||||
* accompanies this distribution, is reproduced below, and is | |||||
* available at http://www.eclipse.org/org/documents/edl-v10.php | |||||
* | |||||
* All rights reserved. | |||||
* | |||||
* Redistribution and use in source and binary forms, with or | |||||
* without modification, are permitted provided that the following | |||||
* conditions are met: | |||||
* | |||||
* - Redistributions of source code must retain the above copyright | |||||
* notice, this list of conditions and the following disclaimer. | |||||
* | |||||
* - Redistributions in binary form must reproduce the above | |||||
* copyright notice, this list of conditions and the following | |||||
* disclaimer in the documentation and/or other materials provided | |||||
* with the distribution. | |||||
* | |||||
* - Neither the name of the Eclipse Foundation, Inc. nor the | |||||
* names of its contributors may be used to endorse or promote | |||||
* products derived from this software without specific prior | |||||
* written permission. | |||||
* | |||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND | |||||
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, | |||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | |||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*/ | |||||
package org.eclipse.jgit.diff; | |||||
/** | |||||
* An implementation of the patience difference algorithm. | |||||
* | |||||
* This implementation was derived by using the 4 rules that are outlined in | |||||
* Bram Cohen's <a href="http://bramcohen.livejournal.com/73318.html">blog</a>. | |||||
* | |||||
* Because this algorithm requires finding a unique common point to center the | |||||
* longest common subsequence around, input sequences which have no unique | |||||
* elements create a degenerate Edit that simply replaces all of one sequence | |||||
* with all of the other sequence. For many source code files and other human | |||||
* maintained text, this isn't likely to occur. When it does occur, it can be | |||||
* easier to read the resulting large-scale replace than to navigate through a | |||||
* lot of slices of common-but-not-unique lines, like curly braces on lone | |||||
* lines, or XML close tags. Consequently this algorithm is willing to create a | |||||
* degenerate Edit in the worst case, in exchange for what may still be | |||||
* perceived to be an easier to read patch script. | |||||
* | |||||
* In a nutshell, the implementation defines an Edit that replaces all of | |||||
* sequence {@code a} with all of {@code b}. This Edit is reduced and/or split | |||||
* to remove common elements, until only Edits spanning non-common elements | |||||
* remain. Those {@link Edit}s are the differences. | |||||
* | |||||
* A slightly more detailed description of the implementation is: | |||||
* | |||||
* <ol> | |||||
* <li>Define an Edit that spans the entire two sequences. This edit replaces | |||||
* all of {@code a} with all of {@code b}.</li> | |||||
* | |||||
* <li>Shrink the Edit by shifting the starting points later in the sequence to | |||||
* skip over any elements that are common between {@code a} and {@code b}. | |||||
* Likewise shift the ending points earlier in the sequence to skip any trailing | |||||
* elements that are common. The first and last element of the edit are now not | |||||
* common, however there may be common content within the interior of the Edit | |||||
* that hasn't been discovered yet.</li> | |||||
* | |||||
* <li>Find unique elements within the Edit region that are in both sequences. | |||||
* This is currently accomplished by hashing the elements and merging them | |||||
* through a custom hash table in {@link PatienceDiffIndex}.</li> | |||||
* | |||||
* <li>Order the common unique elements by their position within {@code b}.</li> | |||||
* | |||||
* <li>For each unique element, stretch an Edit around it in both directions, | |||||
* consuming neighboring elements that are common to both sequences. Select the | |||||
* longest such Edit out of the unique element list. During this stretching, | |||||
* some subsequent unique elements may be consumed into an earlier's common | |||||
* Edit. This means not all unique elements are evaluated.</li> | |||||
* | |||||
* <li>Split the Edit region at the longest common edit. Because step 2 shrank | |||||
* the initial region, there must be at least one element before, and at least | |||||
* one element after the split.</li> | |||||
* | |||||
* <li>Recurse on the before and after split points, starting from step 3. Step | |||||
* 2 doesn't need to be done again because any common part was already removed | |||||
* by the prior step 2 or 5.</li> | |||||
* </ol> | |||||
*/ | |||||
public class PatienceDiff extends DiffAlgorithm { | |||||
/** Algorithm we use when there are no common unique lines in a region. */ | |||||
private DiffAlgorithm fallback; | |||||
/** | |||||
* Set the algorithm used when there are no common unique lines remaining. | |||||
* | |||||
* @param alg | |||||
* the secondary algorithm. If null the region will be denoted as | |||||
* a single REPLACE block. | |||||
*/ | |||||
public void setFallbackAlgorithm(DiffAlgorithm alg) { | |||||
fallback = alg; | |||||
} | |||||
public <S extends Sequence> EditList diffNonCommon( | |||||
SequenceComparator<? super S> cmp, S a, S b) { | |||||
State<S> s = new State<S>(new HashedSequencePair<S>(cmp, a, b)); | |||||
s.diffReplace(new Edit(0, s.a.size(), 0, s.b.size()), null, 0, 0); | |||||
return s.edits; | |||||
} | |||||
private class State<S extends Sequence> { | |||||
private final HashedSequenceComparator<S> cmp; | |||||
private final HashedSequence<S> a; | |||||
private final HashedSequence<S> b; | |||||
/** Result edits we have determined that must be made to convert a to b. */ | |||||
final EditList edits; | |||||
State(HashedSequencePair<S> p) { | |||||
this.cmp = p.getComparator(); | |||||
this.a = p.getA(); | |||||
this.b = p.getB(); | |||||
this.edits = new EditList(); | |||||
} | |||||
void diffReplace(Edit r, long[] pCommon, int pIdx, int pEnd) { | |||||
PatienceDiffIndex<S> p; | |||||
Edit lcs; | |||||
p = new PatienceDiffIndex<S>(cmp, a, b, r, pCommon, pIdx, pEnd); | |||||
lcs = p.findLongestCommonSequence(); | |||||
if (lcs != null) { | |||||
pCommon = p.nCommon; | |||||
pIdx = p.cIdx; | |||||
pEnd = p.nCnt; | |||||
p = null; | |||||
diff(r.before(lcs), pCommon, 0, pIdx); | |||||
diff(r.after(lcs), pCommon, pIdx + 1, pEnd); | |||||
} else if (fallback != null) { | |||||
pCommon = null; | |||||
p = null; | |||||
SubsequenceComparator<HashedSequence<S>> cs = subcmp(); | |||||
Subsequence<HashedSequence<S>> as = Subsequence.a(a, r); | |||||
Subsequence<HashedSequence<S>> bs = Subsequence.b(b, r); | |||||
EditList res = fallback.diffNonCommon(cs, as, bs); | |||||
edits.addAll(Subsequence.toBase(res, as, bs)); | |||||
} else { | |||||
edits.add(r); | |||||
} | |||||
} | |||||
private void diff(Edit r, long[] pCommon, int pIdx, int pEnd) { | |||||
switch (r.getType()) { | |||||
case INSERT: | |||||
case DELETE: | |||||
edits.add(r); | |||||
break; | |||||
case REPLACE: | |||||
diffReplace(r, pCommon, pIdx, pEnd); | |||||
break; | |||||
case EMPTY: | |||||
break; | |||||
default: | |||||
throw new IllegalStateException(); | |||||
} | |||||
} | |||||
private SubsequenceComparator<HashedSequence<S>> subcmp() { | |||||
return new SubsequenceComparator<HashedSequence<S>>(cmp); | |||||
} | |||||
} | |||||
} |
/* | |||||
* Copyright (C) 2010, Google Inc. | |||||
* and other copyright owners as documented in the project's IP log. | |||||
* | |||||
* This program and the accompanying materials are made available | |||||
* under the terms of the Eclipse Distribution License v1.0 which | |||||
* accompanies this distribution, is reproduced below, and is | |||||
* available at http://www.eclipse.org/org/documents/edl-v10.php | |||||
* | |||||
* All rights reserved. | |||||
* | |||||
* Redistribution and use in source and binary forms, with or | |||||
* without modification, are permitted provided that the following | |||||
* conditions are met: | |||||
* | |||||
* - Redistributions of source code must retain the above copyright | |||||
* notice, this list of conditions and the following disclaimer. | |||||
* | |||||
* - Redistributions in binary form must reproduce the above | |||||
* copyright notice, this list of conditions and the following | |||||
* disclaimer in the documentation and/or other materials provided | |||||
* with the distribution. | |||||
* | |||||
* - Neither the name of the Eclipse Foundation, Inc. nor the | |||||
* names of its contributors may be used to endorse or promote | |||||
* products derived from this software without specific prior | |||||
* written permission. | |||||
* | |||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND | |||||
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, | |||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | |||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*/ | |||||
package org.eclipse.jgit.diff; | |||||
/** | |||||
* Supports {@link PatienceDiff} by finding unique but common elements. | |||||
* | |||||
* This index object is constructed once for each region being considered by the | |||||
* main {@link PatienceDiff} algorithm, which really means its once for each | |||||
* recursive step. Each index instance processes a fixed sized region from the | |||||
* sequences, and during recursion the region is split into two smaller segments | |||||
* and processed again. | |||||
* | |||||
* Index instances from a higher level invocation message some state into a | |||||
* lower level invocation by passing the {@link #nCommon} array from the higher | |||||
* invocation into the two sub-steps as {@link #pCommon}. This permits some | |||||
* matching work that was already done in the higher invocation to be reused in | |||||
* the sub-step and can save a lot of time when element equality is expensive. | |||||
* | |||||
* @param <S> | |||||
* type of sequence the scanner will scan. | |||||
*/ | |||||
final class PatienceDiffIndex<S extends Sequence> { | |||||
private static final int A_DUPLICATE = 1; | |||||
private static final int B_DUPLICATE = 2; | |||||
private static final int DUPLICATE_MASK = B_DUPLICATE | A_DUPLICATE; | |||||
private static final int A_SHIFT = 2; | |||||
private static final int B_SHIFT = 31 + 2; | |||||
private static final int PTR_MASK = 0x7fffffff; | |||||
private final HashedSequenceComparator<S> cmp; | |||||
private final HashedSequence<S> a; | |||||
private final HashedSequence<S> b; | |||||
private final Edit region; | |||||
/** Pairs of beginB, endB indices previously found to be common and unique. */ | |||||
private final long[] pCommon; | |||||
/** First valid index in {@link #pCommon}. */ | |||||
private final int pBegin; | |||||
/** 1 past the last valid entry in {@link #pCommon}. */ | |||||
private final int pEnd; | |||||
/** Keyed by {@link #hash(HashedSequence, int)} to get an entry offset. */ | |||||
private final int[] table; | |||||
/** Number of low bits to discard from a key to index {@link #table}. */ | |||||
private final int keyShift; | |||||
// To save memory the buckets for hash chains are stored in correlated | |||||
// arrays. This permits us to get 3 values per entry, without paying | |||||
// the penalty for an object header on each entry. | |||||
/** | |||||
* A matched (or partially examined) element from the two sequences. | |||||
* | |||||
* This is actually a 4-tuple: (bPtr, aPtrP1, bDuplicate, aDuplicate). | |||||
* | |||||
* bPtr and aPtr are each 31 bits. bPtr is exactly the position in the b | |||||
* sequence, while aPtrP1 is {@code aPtr + 1}. This permits us to determine | |||||
* if there is corresponding element in a by testing for aPtrP1 != 0. If it | |||||
* equals 0, there is no element in a. If it equals 1, element 0 of a | |||||
* matches with element bPtr of b. | |||||
* | |||||
* bDuplicate is 1 if this element occurs more than once in b; likewise | |||||
* aDuplicate is 1 if this element occurs more than once in a. These flags | |||||
* permit each element to only be added to the index once. As the duplicates | |||||
* are the low 2 bits a unique record meets (@code (rec & 2) == 0}. | |||||
*/ | |||||
private final long[] ptrs; | |||||
/** Array index of the next entry in the table; 0 if at end of chain. */ | |||||
private final int[] next; | |||||
/** Total number of entries that exist in {@link #ptrs}. */ | |||||
private int entryCnt; | |||||
/** Number of entries in {@link #ptrs} that are actually unique. */ | |||||
private int uniqueCommonCnt; | |||||
/** | |||||
* Pairs of beginB, endB indices found to be common and unique. | |||||
* | |||||
* In order to find the longest common (but unique) sequence within a | |||||
* region, we also found all of the other common but unique sequences in | |||||
* that same region. This array stores all of those results, allowing them | |||||
* to be passed into the subsequent recursive passes so we can later reuse | |||||
* these matches and avoid recomputing the same points again. | |||||
*/ | |||||
long[] nCommon; | |||||
/** Number of items in {@link #nCommon}. */ | |||||
int nCnt; | |||||
/** Index of the longest common subsequence in {@link #nCommon}. */ | |||||
int cIdx; | |||||
PatienceDiffIndex(HashedSequenceComparator<S> cmp, // | |||||
HashedSequence<S> a, // | |||||
HashedSequence<S> b, // | |||||
Edit region, // | |||||
long[] pCommon, int pIdx, int pCnt) { | |||||
this.cmp = cmp; | |||||
this.a = a; | |||||
this.b = b; | |||||
this.region = region; | |||||
this.pCommon = pCommon; | |||||
this.pBegin = pIdx; | |||||
this.pEnd = pCnt; | |||||
final int sz = region.getLengthB(); | |||||
final int tableBits = tableBits(sz); | |||||
table = new int[1 << tableBits]; | |||||
keyShift = 32 - tableBits; | |||||
// As we insert elements we preincrement so that 0 is never a | |||||
// valid entry. Therefore we have to allocate one extra space. | |||||
// | |||||
ptrs = new long[1 + sz]; | |||||
next = new int[ptrs.length]; | |||||
} | |||||
/** | |||||
* Index elements in sequence B for later matching with sequence A. | |||||
* | |||||
* This is the first stage of preparing an index to find the longest common | |||||
* sequence. Elements of sequence B in the range [ptr, end) are scanned in | |||||
* order and added to the internal hashtable. | |||||
* | |||||
* If prior matches were given in the constructor, these may be used to | |||||
* fast-forward through sections of B to avoid unnecessary recomputation. | |||||
*/ | |||||
private void scanB() { | |||||
// We insert in ascending order so that a later scan of the table | |||||
// from 0 through entryCnt will iterate through B in order. This | |||||
// is the desired result ordering from match(). | |||||
// | |||||
int ptr = region.beginB; | |||||
final int end = region.endB; | |||||
int pIdx = pBegin; | |||||
SCAN: while (ptr < end) { | |||||
final int tIdx = hash(b, ptr); | |||||
if (pIdx < pEnd) { | |||||
final long priorRec = pCommon[pIdx]; | |||||
if (ptr == bOf(priorRec)) { | |||||
// We know this region is unique from a prior pass. | |||||
// Insert the start point, and skip right to the end. | |||||
// | |||||
insertB(tIdx, ptr); | |||||
pIdx++; | |||||
ptr = aOfRaw(priorRec); | |||||
continue SCAN; | |||||
} | |||||
} | |||||
// We aren't sure what the status of this element is. Add | |||||
// it to our hashtable, and flag it as duplicate if there | |||||
// was already a different entry present. | |||||
// | |||||
for (int eIdx = table[tIdx]; eIdx != 0; eIdx = next[eIdx]) { | |||||
final long rec = ptrs[eIdx]; | |||||
if (cmp.equals(b, ptr, b, bOf(rec))) { | |||||
ptrs[eIdx] = rec | B_DUPLICATE; | |||||
ptr++; | |||||
continue SCAN; | |||||
} | |||||
} | |||||
insertB(tIdx, ptr); | |||||
ptr++; | |||||
} | |||||
} | |||||
private void insertB(final int tIdx, int ptr) { | |||||
final int eIdx = ++entryCnt; | |||||
ptrs[eIdx] = ((long) ptr) << B_SHIFT; | |||||
next[eIdx] = table[tIdx]; | |||||
table[tIdx] = eIdx; | |||||
} | |||||
/** | |||||
* Index elements in sequence A for later matching. | |||||
* | |||||
* This is the second stage of preparing an index to find the longest common | |||||
* sequence. The state requires {@link #scanB()} to have been invoked first. | |||||
* | |||||
* Each element of A in the range [ptr, end) are searched for in the | |||||
* internal hashtable, to see if B has already registered a location. | |||||
* | |||||
* If prior matches were given in the constructor, these may be used to | |||||
* fast-forward through sections of A to avoid unnecessary recomputation. | |||||
*/ | |||||
private void scanA() { | |||||
int ptr = region.beginA; | |||||
final int end = region.endA; | |||||
int pLast = pBegin; | |||||
SCAN: while (ptr < end) { | |||||
final int tIdx = hash(a, ptr); | |||||
for (int eIdx = table[tIdx]; eIdx != 0; eIdx = next[eIdx]) { | |||||
final long rec = ptrs[eIdx]; | |||||
final int bs = bOf(rec); | |||||
if (isDuplicate(rec) || !cmp.equals(a, ptr, b, bs)) | |||||
continue; | |||||
final int aPtr = aOfRaw(rec); | |||||
if (aPtr != 0 && cmp.equals(a, ptr, a, aPtr - 1)) { | |||||
ptrs[eIdx] = rec | A_DUPLICATE; | |||||
uniqueCommonCnt--; | |||||
ptr++; | |||||
continue SCAN; | |||||
} | |||||
// This element is both common and unique. Link the | |||||
// two sequences together at this point. | |||||
// | |||||
ptrs[eIdx] = rec | (((long) (ptr + 1)) << A_SHIFT); | |||||
uniqueCommonCnt++; | |||||
if (pBegin < pEnd) { | |||||
// If we have prior match point data, we might be able | |||||
// to locate the length of the match and skip past all | |||||
// of those elements. We try to take advantage of the | |||||
// fact that pCommon is sorted by B, and its likely that | |||||
// matches in A appear in the same order as they do in B. | |||||
// | |||||
for (int pIdx = pLast;;) { | |||||
final long priorRec = pCommon[pIdx]; | |||||
final int priorB = bOf(priorRec); | |||||
if (bs < priorB) | |||||
break; | |||||
if (bs == priorB) { | |||||
ptr += aOfRaw(priorRec) - priorB; | |||||
pLast = pIdx; | |||||
continue SCAN; | |||||
} | |||||
pIdx++; | |||||
if (pIdx == pEnd) | |||||
pIdx = pBegin; | |||||
if (pIdx == pLast) | |||||
break; | |||||
} | |||||
} | |||||
ptr++; | |||||
continue SCAN; | |||||
} | |||||
ptr++; | |||||
} | |||||
} | |||||
/** | |||||
* Scan all potential matches and find the longest common sequence. | |||||
* | |||||
* If this method returns non-null, the caller should copy out the | |||||
* {@link #nCommon} array and pass that through to the recursive sub-steps | |||||
* so that existing common matches can be reused rather than recomputed. | |||||
* | |||||
* @return an edit covering the longest common sequence. Null if there are | |||||
* no common unique sequences present. | |||||
*/ | |||||
Edit findLongestCommonSequence() { | |||||
scanB(); | |||||
scanA(); | |||||
if (uniqueCommonCnt == 0) | |||||
return null; | |||||
nCommon = new long[uniqueCommonCnt]; | |||||
int pIdx = pBegin; | |||||
Edit lcs = new Edit(0, 0); | |||||
MATCH: for (int eIdx = 1; eIdx <= entryCnt; eIdx++) { | |||||
final long rec = ptrs[eIdx]; | |||||
if (isDuplicate(rec) || aOfRaw(rec) == 0) | |||||
continue; | |||||
int bs = bOf(rec); | |||||
if (bs < lcs.endB) | |||||
continue; | |||||
int as = aOf(rec); | |||||
if (pIdx < pEnd) { | |||||
final long priorRec = pCommon[pIdx]; | |||||
if (bs == bOf(priorRec)) { | |||||
// We had a prior match and we know its unique. | |||||
// Reuse its region rather than computing again. | |||||
// | |||||
int be = aOfRaw(priorRec); | |||||
if (lcs.getLengthB() < be - bs) { | |||||
as -= bOf(rec) - bs; | |||||
lcs.beginA = as; | |||||
lcs.beginB = bs; | |||||
lcs.endA = as + (be - bs); | |||||
lcs.endB = be; | |||||
cIdx = nCnt; | |||||
} | |||||
nCommon[nCnt] = priorRec; | |||||
if (++nCnt == uniqueCommonCnt) | |||||
break MATCH; | |||||
pIdx++; | |||||
continue MATCH; | |||||
} | |||||
} | |||||
// We didn't have prior match data, or this is the first time | |||||
// seeing this particular pair. Extend the region as large as | |||||
// possible and remember it for future use. | |||||
// | |||||
int ae = as + 1; | |||||
int be = bs + 1; | |||||
while (region.beginA < as && region.beginB < bs | |||||
&& cmp.equals(a, as - 1, b, bs - 1)) { | |||||
as--; | |||||
bs--; | |||||
} | |||||
while (ae < region.endA && be < region.endB | |||||
&& cmp.equals(a, ae, b, be)) { | |||||
ae++; | |||||
be++; | |||||
} | |||||
if (lcs.getLengthB() < be - bs) { | |||||
lcs.beginA = as; | |||||
lcs.beginB = bs; | |||||
lcs.endA = ae; | |||||
lcs.endB = be; | |||||
cIdx = nCnt; | |||||
} | |||||
nCommon[nCnt] = (((long) bs) << B_SHIFT) | (((long) be) << A_SHIFT); | |||||
if (++nCnt == uniqueCommonCnt) | |||||
break MATCH; | |||||
} | |||||
return lcs; | |||||
} | |||||
private int hash(HashedSequence<S> s, int idx) { | |||||
return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift; | |||||
} | |||||
private static boolean isDuplicate(long rec) { | |||||
return (((int) rec) & DUPLICATE_MASK) != 0; | |||||
} | |||||
private static int aOfRaw(long rec) { | |||||
return ((int) (rec >>> A_SHIFT)) & PTR_MASK; | |||||
} | |||||
private static int aOf(long rec) { | |||||
return aOfRaw(rec) - 1; | |||||
} | |||||
private static int bOf(long rec) { | |||||
return (int) (rec >>> B_SHIFT); | |||||
} | |||||
private static int tableBits(final int sz) { | |||||
int bits = 31 - Integer.numberOfLeadingZeros(sz); | |||||
if (bits == 0) | |||||
bits = 1; | |||||
if (1 << bits < sz) | |||||
bits++; | |||||
return bits; | |||||
} | |||||
} |