Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

HistogramDiffIndex.java 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. /*
  2. * Copyright (C) 2010, Google Inc.
  3. * and other copyright owners as documented in the project's IP log.
  4. *
  5. * This program and the accompanying materials are made available
  6. * under the terms of the Eclipse Distribution License v1.0 which
  7. * accompanies this distribution, is reproduced below, and is
  8. * available at http://www.eclipse.org/org/documents/edl-v10.php
  9. *
  10. * All rights reserved.
  11. *
  12. * Redistribution and use in source and binary forms, with or
  13. * without modification, are permitted provided that the following
  14. * conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above copyright
  17. * notice, this list of conditions and the following disclaimer.
  18. *
  19. * - Redistributions in binary form must reproduce the above
  20. * copyright notice, this list of conditions and the following
  21. * disclaimer in the documentation and/or other materials provided
  22. * with the distribution.
  23. *
  24. * - Neither the name of the Eclipse Foundation, Inc. nor the
  25. * names of its contributors may be used to endorse or promote
  26. * products derived from this software without specific prior
  27. * written permission.
  28. *
  29. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  30. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  31. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  32. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  34. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  35. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  36. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  37. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  38. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  39. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  40. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  41. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42. */
  43. package org.eclipse.jgit.diff;
  44. import org.eclipse.jgit.internal.JGitText;
  45. /**
  46. * Support {@link HistogramDiff} by computing occurrence counts of elements.
  47. *
  48. * Each element in the range being considered is put into a hash table, tracking
  49. * the number of times that distinct element appears in the sequence. Once all
  50. * elements have been inserted from sequence A, each element of sequence B is
  51. * probed in the hash table and the longest common subsequence with the lowest
  52. * occurrence count in A is used as the result.
  53. *
  54. * @param <S>
  55. * type of the base sequence.
  56. */
  57. final class HistogramDiffIndex<S extends Sequence> {
  58. private static final int REC_NEXT_SHIFT = 28 + 8;
  59. private static final int REC_PTR_SHIFT = 8;
  60. private static final int REC_PTR_MASK = (1 << 28) - 1;
  61. private static final int REC_CNT_MASK = (1 << 8) - 1;
  62. private static final int MAX_PTR = REC_PTR_MASK;
  63. private static final int MAX_CNT = (1 << 8) - 1;
  64. private final int maxChainLength;
  65. private final HashedSequenceComparator<S> cmp;
  66. private final HashedSequence<S> a;
  67. private final HashedSequence<S> b;
  68. private final Edit region;
  69. /** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
  70. private final int[] table;
  71. /** Number of low bits to discard from a key to index {@link #table}. */
  72. private final int keyShift;
  73. /**
  74. * Describes a unique element in sequence A.
  75. *
  76. * The records in this table are actually 3-tuples of:
  77. * <ul>
  78. * <li>index of next record in this table that has same hash code</li>
  79. * <li>index of first element in this occurrence chain</li>
  80. * <li>occurrence count for this element (length of locs list)</li>
  81. * </ul>
  82. *
  83. * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
  84. * a few bits wide. Elements that occur more frequently will have their
  85. * count capped.
  86. */
  87. private long[] recs;
  88. /** Number of elements in {@link #recs}; also is the unique element count. */
  89. private int recCnt;
  90. /**
  91. * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
  92. *
  93. * For the sequence element {@code ptr}, the value stored at location
  94. * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
  95. * element in the sequence.
  96. *
  97. * Chains always run from the lowest index to the largest index. Therefore
  98. * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
  99. * This allows a chain to terminate with {@code 0}, as {@code 0} would never
  100. * be a valid next element.
  101. *
  102. * The array is sized to be {@code region.getLengthA()} and element indexes
  103. * are converted to array indexes by subtracting {@link #ptrShift}, which is
  104. * just a cached version of {@code region.beginA}.
  105. */
  106. private int[] next;
  107. /**
  108. * For element {@code ptr} in A, index of the record in {@link #recs} array.
  109. *
  110. * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
  111. * describing all occurrences of the element appearing in sequence A at
  112. * position {@code ptr}. The record is needed to get the occurrence count of
  113. * the element, or to locate all other occurrences of that element within
  114. * sequence A. This index provides constant-time access to the record, and
  115. * avoids needing to scan the hash chain.
  116. */
  117. private int[] recIdx;
  118. /** Value to subtract from element indexes to key {@link #next} array. */
  119. private int ptrShift;
  120. private Edit lcs;
  121. private int cnt;
  122. private boolean hasCommon;
  123. HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
  124. HashedSequence<S> a, HashedSequence<S> b, Edit r) {
  125. this.maxChainLength = maxChainLength;
  126. this.cmp = cmp;
  127. this.a = a;
  128. this.b = b;
  129. this.region = r;
  130. if (region.endA >= MAX_PTR)
  131. throw new IllegalArgumentException(
  132. JGitText.get().sequenceTooLargeForDiffAlgorithm);
  133. final int sz = r.getLengthA();
  134. final int tableBits = tableBits(sz);
  135. table = new int[1 << tableBits];
  136. keyShift = 32 - tableBits;
  137. ptrShift = r.beginA;
  138. recs = new long[Math.max(4, sz >>> 3)];
  139. next = new int[sz];
  140. recIdx = new int[sz];
  141. }
  142. Edit findLongestCommonSequence() {
  143. if (!scanA())
  144. return null;
  145. lcs = new Edit(0, 0);
  146. cnt = maxChainLength + 1;
  147. for (int bPtr = region.beginB; bPtr < region.endB;)
  148. bPtr = tryLongestCommonSequence(bPtr);
  149. return hasCommon && maxChainLength < cnt ? null : lcs;
  150. }
  151. private boolean scanA() {
  152. // Scan the elements backwards, inserting them into the hash table
  153. // as we go. Going in reverse places the earliest occurrence of any
  154. // element at the start of the chain, so we consider earlier matches
  155. // before later matches.
  156. //
  157. SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
  158. final int tIdx = hash(a, ptr);
  159. int chainLen = 0;
  160. for (int rIdx = table[tIdx]; rIdx != 0;) {
  161. final long rec = recs[rIdx];
  162. if (cmp.equals(a, recPtr(rec), a, ptr)) {
  163. // ptr is identical to another element. Insert it onto
  164. // the front of the existing element chain.
  165. //
  166. int newCnt = recCnt(rec) + 1;
  167. if (MAX_CNT < newCnt)
  168. newCnt = MAX_CNT;
  169. recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
  170. next[ptr - ptrShift] = recPtr(rec);
  171. recIdx[ptr - ptrShift] = rIdx;
  172. continue SCAN;
  173. }
  174. rIdx = recNext(rec);
  175. chainLen++;
  176. }
  177. if (chainLen == maxChainLength)
  178. return false;
  179. // This is the first time we have ever seen this particular
  180. // element in the sequence. Construct a new chain for it.
  181. //
  182. final int rIdx = ++recCnt;
  183. if (rIdx == recs.length) {
  184. int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
  185. long[] n = new long[sz];
  186. System.arraycopy(recs, 0, n, 0, recs.length);
  187. recs = n;
  188. }
  189. recs[rIdx] = recCreate(table[tIdx], ptr, 1);
  190. recIdx[ptr - ptrShift] = rIdx;
  191. table[tIdx] = rIdx;
  192. }
  193. return true;
  194. }
  195. private int tryLongestCommonSequence(final int bPtr) {
  196. int bNext = bPtr + 1;
  197. int rIdx = table[hash(b, bPtr)];
  198. for (long rec; rIdx != 0; rIdx = recNext(rec)) {
  199. rec = recs[rIdx];
  200. // If there are more occurrences in A, don't use this chain.
  201. if (recCnt(rec) > cnt) {
  202. if (!hasCommon)
  203. hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
  204. continue;
  205. }
  206. int as = recPtr(rec);
  207. if (!cmp.equals(a, as, b, bPtr))
  208. continue;
  209. hasCommon = true;
  210. TRY_LOCATIONS: for (;;) {
  211. int np = next[as - ptrShift];
  212. int bs = bPtr;
  213. int ae = as + 1;
  214. int be = bs + 1;
  215. int rc = recCnt(rec);
  216. while (region.beginA < as && region.beginB < bs
  217. && cmp.equals(a, as - 1, b, bs - 1)) {
  218. as--;
  219. bs--;
  220. if (1 < rc)
  221. rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
  222. }
  223. while (ae < region.endA && be < region.endB
  224. && cmp.equals(a, ae, b, be)) {
  225. if (1 < rc)
  226. rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
  227. ae++;
  228. be++;
  229. }
  230. if (bNext < be)
  231. bNext = be;
  232. if (lcs.getLengthA() < ae - as || rc < cnt) {
  233. // If this region is the longest, or there are less
  234. // occurrences of it in A, its now our LCS.
  235. //
  236. lcs.beginA = as;
  237. lcs.beginB = bs;
  238. lcs.endA = ae;
  239. lcs.endB = be;
  240. cnt = rc;
  241. }
  242. // Because we added elements in reverse order index 0
  243. // cannot possibly be the next position. Its the first
  244. // element of the sequence and thus would have been the
  245. // value of as at the start of the TRY_LOCATIONS loop.
  246. //
  247. if (np == 0)
  248. break TRY_LOCATIONS;
  249. while (np < ae) {
  250. // The next location to consider was actually within
  251. // the LCS we examined above. Don't reconsider it.
  252. //
  253. np = next[np - ptrShift];
  254. if (np == 0)
  255. break TRY_LOCATIONS;
  256. }
  257. as = np;
  258. }
  259. }
  260. return bNext;
  261. }
  262. private int hash(HashedSequence<S> s, int idx) {
  263. return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
  264. }
  265. private static long recCreate(int next, int ptr, int cnt) {
  266. return ((long) next << REC_NEXT_SHIFT) //
  267. | ((long) ptr << REC_PTR_SHIFT) //
  268. | cnt;
  269. }
  270. private static int recNext(long rec) {
  271. return (int) (rec >>> REC_NEXT_SHIFT);
  272. }
  273. private static int recPtr(long rec) {
  274. return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
  275. }
  276. private static int recCnt(long rec) {
  277. return ((int) rec) & REC_CNT_MASK;
  278. }
  279. private static int tableBits(final int sz) {
  280. int bits = 31 - Integer.numberOfLeadingZeros(sz);
  281. if (bits == 0)
  282. bits = 1;
  283. if (1 << bits < sz)
  284. bits++;
  285. return bits;
  286. }
  287. }