You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

HistogramDiffIndex.java 8.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. /*
  2. * Copyright (C) 2010, Google Inc. and others
  3. *
  4. * This program and the accompanying materials are made available under the
  5. * terms of the Eclipse Distribution License v. 1.0 which is available at
  6. * https://www.eclipse.org/org/documents/edl-v10.php.
  7. *
  8. * SPDX-License-Identifier: BSD-3-Clause
  9. */
  10. package org.eclipse.jgit.diff;
  11. import org.eclipse.jgit.internal.JGitText;
  12. /**
  13. * Support {@link HistogramDiff} by computing occurrence counts of elements.
  14. * <p>
  15. * Each element in the range being considered is put into a hash table, tracking
  16. * the number of times that distinct element appears in the sequence. Once all
  17. * elements have been inserted from sequence A, each element of sequence B is
  18. * probed in the hash table and the longest common subsequence with the lowest
  19. * occurrence count in A is used as the result.
  20. *
  21. * @param <S>
  22. * type of the base sequence.
  23. */
  24. final class HistogramDiffIndex<S extends Sequence> {
  25. private static final int REC_NEXT_SHIFT = 28 + 8;
  26. private static final int REC_PTR_SHIFT = 8;
  27. private static final int REC_PTR_MASK = (1 << 28) - 1;
  28. private static final int REC_CNT_MASK = (1 << 8) - 1;
  29. private static final int MAX_PTR = REC_PTR_MASK;
  30. private static final int MAX_CNT = (1 << 8) - 1;
  31. private final int maxChainLength;
  32. private final HashedSequenceComparator<S> cmp;
  33. private final HashedSequence<S> a;
  34. private final HashedSequence<S> b;
  35. private final Edit region;
  36. /** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
  37. private final int[] table;
  38. /** Number of low bits to discard from a key to index {@link #table}. */
  39. private final int keyShift;
  40. /**
  41. * Describes a unique element in sequence A.
  42. *
  43. * The records in this table are actually 3-tuples of:
  44. * <ul>
  45. * <li>index of next record in this table that has same hash code</li>
  46. * <li>index of first element in this occurrence chain</li>
  47. * <li>occurrence count for this element (length of locs list)</li>
  48. * </ul>
  49. *
  50. * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
  51. * a few bits wide. Elements that occur more frequently will have their
  52. * count capped.
  53. */
  54. private long[] recs;
  55. /** Number of elements in {@link #recs}; also is the unique element count. */
  56. private int recCnt;
  57. /**
  58. * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
  59. *
  60. * For the sequence element {@code ptr}, the value stored at location
  61. * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
  62. * element in the sequence.
  63. *
  64. * Chains always run from the lowest index to the largest index. Therefore
  65. * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
  66. * This allows a chain to terminate with {@code 0}, as {@code 0} would never
  67. * be a valid next element.
  68. *
  69. * The array is sized to be {@code region.getLengthA()} and element indexes
  70. * are converted to array indexes by subtracting {@link #ptrShift}, which is
  71. * just a cached version of {@code region.beginA}.
  72. */
  73. private int[] next;
  74. /**
  75. * For element {@code ptr} in A, index of the record in {@link #recs} array.
  76. *
  77. * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
  78. * describing all occurrences of the element appearing in sequence A at
  79. * position {@code ptr}. The record is needed to get the occurrence count of
  80. * the element, or to locate all other occurrences of that element within
  81. * sequence A. This index provides constant-time access to the record, and
  82. * avoids needing to scan the hash chain.
  83. */
  84. private int[] recIdx;
  85. /** Value to subtract from element indexes to key {@link #next} array. */
  86. private int ptrShift;
  87. private Edit lcs;
  88. private int cnt;
  89. private boolean hasCommon;
  90. HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
  91. HashedSequence<S> a, HashedSequence<S> b, Edit r) {
  92. this.maxChainLength = maxChainLength;
  93. this.cmp = cmp;
  94. this.a = a;
  95. this.b = b;
  96. this.region = r;
  97. if (region.endA >= MAX_PTR)
  98. throw new IllegalArgumentException(
  99. JGitText.get().sequenceTooLargeForDiffAlgorithm);
  100. final int sz = r.getLengthA();
  101. final int tableBits = tableBits(sz);
  102. table = new int[1 << tableBits];
  103. keyShift = 32 - tableBits;
  104. ptrShift = r.beginA;
  105. recs = new long[Math.max(4, sz >>> 3)];
  106. next = new int[sz];
  107. recIdx = new int[sz];
  108. }
  109. Edit findLongestCommonSequence() {
  110. if (!scanA())
  111. return null;
  112. lcs = new Edit(0, 0);
  113. cnt = maxChainLength + 1;
  114. for (int bPtr = region.beginB; bPtr < region.endB;)
  115. bPtr = tryLongestCommonSequence(bPtr);
  116. return hasCommon && maxChainLength < cnt ? null : lcs;
  117. }
  118. private boolean scanA() {
  119. // Scan the elements backwards, inserting them into the hash table
  120. // as we go. Going in reverse places the earliest occurrence of any
  121. // element at the start of the chain, so we consider earlier matches
  122. // before later matches.
  123. //
  124. SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
  125. final int tIdx = hash(a, ptr);
  126. int chainLen = 0;
  127. for (int rIdx = table[tIdx]; rIdx != 0;) {
  128. final long rec = recs[rIdx];
  129. if (cmp.equals(a, recPtr(rec), a, ptr)) {
  130. // ptr is identical to another element. Insert it onto
  131. // the front of the existing element chain.
  132. //
  133. int newCnt = recCnt(rec) + 1;
  134. if (MAX_CNT < newCnt)
  135. newCnt = MAX_CNT;
  136. recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
  137. next[ptr - ptrShift] = recPtr(rec);
  138. recIdx[ptr - ptrShift] = rIdx;
  139. continue SCAN;
  140. }
  141. rIdx = recNext(rec);
  142. chainLen++;
  143. }
  144. if (chainLen == maxChainLength)
  145. return false;
  146. // This is the first time we have ever seen this particular
  147. // element in the sequence. Construct a new chain for it.
  148. //
  149. final int rIdx = ++recCnt;
  150. if (rIdx == recs.length) {
  151. int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
  152. long[] n = new long[sz];
  153. System.arraycopy(recs, 0, n, 0, recs.length);
  154. recs = n;
  155. }
  156. recs[rIdx] = recCreate(table[tIdx], ptr, 1);
  157. recIdx[ptr - ptrShift] = rIdx;
  158. table[tIdx] = rIdx;
  159. }
  160. return true;
  161. }
  162. private int tryLongestCommonSequence(int bPtr) {
  163. int bNext = bPtr + 1;
  164. int rIdx = table[hash(b, bPtr)];
  165. for (long rec; rIdx != 0; rIdx = recNext(rec)) {
  166. rec = recs[rIdx];
  167. // If there are more occurrences in A, don't use this chain.
  168. if (recCnt(rec) > cnt) {
  169. if (!hasCommon)
  170. hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
  171. continue;
  172. }
  173. int as = recPtr(rec);
  174. if (!cmp.equals(a, as, b, bPtr))
  175. continue;
  176. hasCommon = true;
  177. TRY_LOCATIONS: for (;;) {
  178. int np = next[as - ptrShift];
  179. int bs = bPtr;
  180. int ae = as + 1;
  181. int be = bs + 1;
  182. int rc = recCnt(rec);
  183. while (region.beginA < as && region.beginB < bs
  184. && cmp.equals(a, as - 1, b, bs - 1)) {
  185. as--;
  186. bs--;
  187. if (1 < rc)
  188. rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
  189. }
  190. while (ae < region.endA && be < region.endB
  191. && cmp.equals(a, ae, b, be)) {
  192. if (1 < rc)
  193. rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
  194. ae++;
  195. be++;
  196. }
  197. if (bNext < be)
  198. bNext = be;
  199. if (lcs.getLengthA() < ae - as || rc < cnt) {
  200. // If this region is the longest, or there are less
  201. // occurrences of it in A, its now our LCS.
  202. //
  203. lcs.beginA = as;
  204. lcs.beginB = bs;
  205. lcs.endA = ae;
  206. lcs.endB = be;
  207. cnt = rc;
  208. }
  209. // Because we added elements in reverse order index 0
  210. // cannot possibly be the next position. Its the first
  211. // element of the sequence and thus would have been the
  212. // value of as at the start of the TRY_LOCATIONS loop.
  213. //
  214. if (np == 0)
  215. break TRY_LOCATIONS;
  216. while (np < ae) {
  217. // The next location to consider was actually within
  218. // the LCS we examined above. Don't reconsider it.
  219. //
  220. np = next[np - ptrShift];
  221. if (np == 0)
  222. break TRY_LOCATIONS;
  223. }
  224. as = np;
  225. }
  226. }
  227. return bNext;
  228. }
  229. private int hash(HashedSequence<S> s, int idx) {
  230. return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
  231. }
  232. private static long recCreate(int next, int ptr, int cnt) {
  233. return ((long) next << REC_NEXT_SHIFT) //
  234. | ((long) ptr << REC_PTR_SHIFT) //
  235. | cnt;
  236. }
  237. private static int recNext(long rec) {
  238. return (int) (rec >>> REC_NEXT_SHIFT);
  239. }
  240. private static int recPtr(long rec) {
  241. return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
  242. }
  243. private static int recCnt(long rec) {
  244. return ((int) rec) & REC_CNT_MASK;
  245. }
  246. private static int tableBits(int sz) {
  247. int bits = 31 - Integer.numberOfLeadingZeros(sz);
  248. if (bits == 0)
  249. bits = 1;
  250. if (1 << bits < sz)
  251. bits++;
  252. return bits;
  253. }
  254. }