123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339 |
- /*
- * Copyright (C) 2010, Google Inc.
- * and other copyright owners as documented in the project's IP log.
- *
- * This program and the accompanying materials are made available
- * under the terms of the Eclipse Distribution License v1.0 which
- * accompanies this distribution, is reproduced below, and is
- * available at http://www.eclipse.org/org/documents/edl-v10.php
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials provided
- * with the distribution.
- *
- * - Neither the name of the Eclipse Foundation, Inc. nor the
- * names of its contributors may be used to endorse or promote
- * products derived from this software without specific prior
- * written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
- * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
- package org.eclipse.jgit.diff;
-
- import org.eclipse.jgit.internal.JGitText;
-
- /**
- * Support {@link HistogramDiff} by computing occurrence counts of elements.
- *
- * Each element in the range being considered is put into a hash table, tracking
- * the number of times that distinct element appears in the sequence. Once all
- * elements have been inserted from sequence A, each element of sequence B is
- * probed in the hash table and the longest common subsequence with the lowest
- * occurrence count in A is used as the result.
- *
- * @param <S>
- * type of the base sequence.
- */
- final class HistogramDiffIndex<S extends Sequence> {
- private static final int REC_NEXT_SHIFT = 28 + 8;
-
- private static final int REC_PTR_SHIFT = 8;
-
- private static final int REC_PTR_MASK = (1 << 28) - 1;
-
- private static final int REC_CNT_MASK = (1 << 8) - 1;
-
- private static final int MAX_PTR = REC_PTR_MASK;
-
- private static final int MAX_CNT = (1 << 8) - 1;
-
- private final int maxChainLength;
-
- private final HashedSequenceComparator<S> cmp;
-
- private final HashedSequence<S> a;
-
- private final HashedSequence<S> b;
-
- private final Edit region;
-
- /** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
- private final int[] table;
-
- /** Number of low bits to discard from a key to index {@link #table}. */
- private final int keyShift;
-
- /**
- * Describes a unique element in sequence A.
- *
- * The records in this table are actually 3-tuples of:
- * <ul>
- * <li>index of next record in this table that has same hash code</li>
- * <li>index of first element in this occurrence chain</li>
- * <li>occurrence count for this element (length of locs list)</li>
- * </ul>
- *
- * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
- * a few bits wide. Elements that occur more frequently will have their
- * count capped.
- */
- private long[] recs;
-
- /** Number of elements in {@link #recs}; also is the unique element count. */
- private int recCnt;
-
- /**
- * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
- *
- * For the sequence element {@code ptr}, the value stored at location
- * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
- * element in the sequence.
- *
- * Chains always run from the lowest index to the largest index. Therefore
- * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
- * This allows a chain to terminate with {@code 0}, as {@code 0} would never
- * be a valid next element.
- *
- * The array is sized to be {@code region.getLengthA()} and element indexes
- * are converted to array indexes by subtracting {@link #ptrShift}, which is
- * just a cached version of {@code region.beginA}.
- */
- private int[] next;
-
- /**
- * For element {@code ptr} in A, index of the record in {@link #recs} array.
- *
- * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
- * describing all occurrences of the element appearing in sequence A at
- * position {@code ptr}. The record is needed to get the occurrence count of
- * the element, or to locate all other occurrences of that element within
- * sequence A. This index provides constant-time access to the record, and
- * avoids needing to scan the hash chain.
- */
- private int[] recIdx;
-
- /** Value to subtract from element indexes to key {@link #next} array. */
- private int ptrShift;
-
- private Edit lcs;
-
- private int cnt;
-
- private boolean hasCommon;
-
- HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
- HashedSequence<S> a, HashedSequence<S> b, Edit r) {
- this.maxChainLength = maxChainLength;
- this.cmp = cmp;
- this.a = a;
- this.b = b;
- this.region = r;
-
- if (region.endA >= MAX_PTR)
- throw new IllegalArgumentException(
- JGitText.get().sequenceTooLargeForDiffAlgorithm);
-
- final int sz = r.getLengthA();
- final int tableBits = tableBits(sz);
- table = new int[1 << tableBits];
- keyShift = 32 - tableBits;
- ptrShift = r.beginA;
-
- recs = new long[Math.max(4, sz >>> 3)];
- next = new int[sz];
- recIdx = new int[sz];
- }
-
- Edit findLongestCommonSequence() {
- if (!scanA())
- return null;
-
- lcs = new Edit(0, 0);
- cnt = maxChainLength + 1;
-
- for (int bPtr = region.beginB; bPtr < region.endB;)
- bPtr = tryLongestCommonSequence(bPtr);
-
- return hasCommon && maxChainLength < cnt ? null : lcs;
- }
-
- private boolean scanA() {
- // Scan the elements backwards, inserting them into the hash table
- // as we go. Going in reverse places the earliest occurrence of any
- // element at the start of the chain, so we consider earlier matches
- // before later matches.
- //
- SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
- final int tIdx = hash(a, ptr);
-
- int chainLen = 0;
- for (int rIdx = table[tIdx]; rIdx != 0;) {
- final long rec = recs[rIdx];
- if (cmp.equals(a, recPtr(rec), a, ptr)) {
- // ptr is identical to another element. Insert it onto
- // the front of the existing element chain.
- //
- int newCnt = recCnt(rec) + 1;
- if (MAX_CNT < newCnt)
- newCnt = MAX_CNT;
- recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
- next[ptr - ptrShift] = recPtr(rec);
- recIdx[ptr - ptrShift] = rIdx;
- continue SCAN;
- }
-
- rIdx = recNext(rec);
- chainLen++;
- }
-
- if (chainLen == maxChainLength)
- return false;
-
- // This is the first time we have ever seen this particular
- // element in the sequence. Construct a new chain for it.
- //
- final int rIdx = ++recCnt;
- if (rIdx == recs.length) {
- int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
- long[] n = new long[sz];
- System.arraycopy(recs, 0, n, 0, recs.length);
- recs = n;
- }
-
- recs[rIdx] = recCreate(table[tIdx], ptr, 1);
- recIdx[ptr - ptrShift] = rIdx;
- table[tIdx] = rIdx;
- }
- return true;
- }
-
- private int tryLongestCommonSequence(final int bPtr) {
- int bNext = bPtr + 1;
- int rIdx = table[hash(b, bPtr)];
- for (long rec; rIdx != 0; rIdx = recNext(rec)) {
- rec = recs[rIdx];
-
- // If there are more occurrences in A, don't use this chain.
- if (recCnt(rec) > cnt) {
- if (!hasCommon)
- hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
- continue;
- }
-
- int as = recPtr(rec);
- if (!cmp.equals(a, as, b, bPtr))
- continue;
-
- hasCommon = true;
- TRY_LOCATIONS: for (;;) {
- int np = next[as - ptrShift];
- int bs = bPtr;
- int ae = as + 1;
- int be = bs + 1;
- int rc = recCnt(rec);
-
- while (region.beginA < as && region.beginB < bs
- && cmp.equals(a, as - 1, b, bs - 1)) {
- as--;
- bs--;
- if (1 < rc)
- rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
- }
- while (ae < region.endA && be < region.endB
- && cmp.equals(a, ae, b, be)) {
- if (1 < rc)
- rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
- ae++;
- be++;
- }
-
- if (bNext < be)
- bNext = be;
- if (lcs.getLengthA() < ae - as || rc < cnt) {
- // If this region is the longest, or there are less
- // occurrences of it in A, its now our LCS.
- //
- lcs.beginA = as;
- lcs.beginB = bs;
- lcs.endA = ae;
- lcs.endB = be;
- cnt = rc;
- }
-
- // Because we added elements in reverse order index 0
- // cannot possibly be the next position. Its the first
- // element of the sequence and thus would have been the
- // value of as at the start of the TRY_LOCATIONS loop.
- //
- if (np == 0)
- break TRY_LOCATIONS;
-
- while (np < ae) {
- // The next location to consider was actually within
- // the LCS we examined above. Don't reconsider it.
- //
- np = next[np - ptrShift];
- if (np == 0)
- break TRY_LOCATIONS;
- }
-
- as = np;
- }
- }
- return bNext;
- }
-
- private int hash(HashedSequence<S> s, int idx) {
- return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
- }
-
- private static long recCreate(int next, int ptr, int cnt) {
- return ((long) next << REC_NEXT_SHIFT) //
- | ((long) ptr << REC_PTR_SHIFT) //
- | cnt;
- }
-
- private static int recNext(long rec) {
- return (int) (rec >>> REC_NEXT_SHIFT);
- }
-
- private static int recPtr(long rec) {
- return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
- }
-
- private static int recCnt(long rec) {
- return ((int) rec) & REC_CNT_MASK;
- }
-
- private static int tableBits(final int sz) {
- int bits = 31 - Integer.numberOfLeadingZeros(sz);
- if (bits == 0)
- bits = 1;
- if (1 << bits < sz)
- bits++;
- return bits;
- }
- }
|