You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

SimilarityIndex.java 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
  1. /*
  2. * Copyright (C) 2010, Google Inc. and others
  3. *
  4. * This program and the accompanying materials are made available under the
  5. * terms of the Eclipse Distribution License v. 1.0 which is available at
  6. * https://www.eclipse.org/org/documents/edl-v10.php.
  7. *
  8. * SPDX-License-Identifier: BSD-3-Clause
  9. */
  10. package org.eclipse.jgit.diff;
  11. import java.io.EOFException;
  12. import java.io.IOException;
  13. import java.io.InputStream;
  14. import java.util.Arrays;
  15. import org.eclipse.jgit.errors.MissingObjectException;
  16. import org.eclipse.jgit.lib.ObjectLoader;
  17. import org.eclipse.jgit.lib.ObjectStream;
  18. /**
  19. * Index structure of lines/blocks in one file.
  20. * <p>
  21. * This structure can be used to compute an approximation of the similarity
  22. * between two files. The index is used by
  23. * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
  24. * between files.
  25. * <p>
  26. * To save space in memory, this index uses a space efficient encoding which
  27. * will not exceed 1 MiB per instance. The index starts out at a smaller size
  28. * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
  29. * file are discovered.
  30. *
  31. * @since 4.0
  32. */
  33. public class SimilarityIndex {
  34. /** A special {@link TableFullException} used in place of OutOfMemoryError. */
  35. public static final TableFullException
  36. TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
  37. /**
  38. * Shift to apply before storing a key.
  39. * <p>
  40. * Within the 64 bit table record space, we leave the highest bit unset so
  41. * all values are positive. The lower 32 bits to count bytes.
  42. */
  43. private static final int KEY_SHIFT = 32;
  44. /** Maximum value of the count field, also mask to extract the count. */
  45. private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
  46. /**
  47. * Total amount of bytes hashed into the structure, including \n. This is
  48. * usually the size of the file minus number of CRLF encounters.
  49. */
  50. private long hashedCnt;
  51. /** Number of non-zero entries in {@link #idHash}. */
  52. private int idSize;
  53. /** {@link #idSize} that triggers {@link #idHash} to double in size. */
  54. private int idGrowAt;
  55. /**
  56. * Pairings of content keys and counters.
  57. * <p>
  58. * Slots in the table are actually two ints wedged into a single long. The
  59. * upper 32 bits stores the content key, and the remaining lower bits stores
  60. * the number of bytes associated with that key. Empty slots are denoted by
  61. * 0, which cannot occur because the count cannot be 0. Values can only be
  62. * positive, which we enforce during key addition.
  63. */
  64. private long[] idHash;
  65. /** {@code idHash.length == 1 << idHashBits}. */
  66. private int idHashBits;
  67. /**
  68. * Create a new similarity index for the given object
  69. *
  70. * @param obj
  71. * the object to hash
  72. * @return similarity index for this object
  73. * @throws java.io.IOException
  74. * file contents cannot be read from the repository.
  75. * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
  76. * object hashing overflowed the storage capacity of the
  77. * SimilarityIndex.
  78. */
  79. public static SimilarityIndex create(ObjectLoader obj) throws IOException,
  80. TableFullException {
  81. SimilarityIndex idx = new SimilarityIndex();
  82. idx.hash(obj);
  83. idx.sort();
  84. return idx;
  85. }
  86. SimilarityIndex() {
  87. idHashBits = 8;
  88. idHash = new long[1 << idHashBits];
  89. idGrowAt = growAt(idHashBits);
  90. }
  91. static boolean isBinary(ObjectLoader obj) throws IOException {
  92. if (obj.isLarge()) {
  93. try (ObjectStream in1 = obj.openStream()) {
  94. return RawText.isBinary(in1);
  95. }
  96. }
  97. return RawText.isBinary(obj.getCachedBytes());
  98. }
  99. void hash(ObjectLoader obj) throws MissingObjectException, IOException,
  100. TableFullException {
  101. if (obj.isLarge()) {
  102. hashLargeObject(obj);
  103. } else {
  104. byte[] raw = obj.getCachedBytes();
  105. hash(raw, 0, raw.length);
  106. }
  107. }
  108. private void hashLargeObject(ObjectLoader obj) throws IOException,
  109. TableFullException {
  110. boolean text;
  111. text = !isBinary(obj);
  112. try (ObjectStream in2 = obj.openStream()) {
  113. hash(in2, in2.getSize(), text);
  114. }
  115. }
  116. void hash(byte[] raw, int ptr, int end) throws TableFullException {
  117. final boolean text = !RawText.isBinary(raw);
  118. hashedCnt = 0;
  119. while (ptr < end) {
  120. int hash = 5381;
  121. int blockHashedCnt = 0;
  122. int start = ptr;
  123. // Hash one line, or one block, whichever occurs first.
  124. do {
  125. int c = raw[ptr++] & 0xff;
  126. // Ignore CR in CRLF sequence if text
  127. if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
  128. continue;
  129. blockHashedCnt++;
  130. if (c == '\n')
  131. break;
  132. hash = (hash << 5) + hash + c;
  133. } while (ptr < end && ptr - start < 64);
  134. hashedCnt += blockHashedCnt;
  135. add(hash, blockHashedCnt);
  136. }
  137. }
  138. void hash(InputStream in, long remaining, boolean text) throws IOException,
  139. TableFullException {
  140. byte[] buf = new byte[4096];
  141. int ptr = 0;
  142. int cnt = 0;
  143. while (0 < remaining) {
  144. int hash = 5381;
  145. int blockHashedCnt = 0;
  146. // Hash one line, or one block, whichever occurs first.
  147. int n = 0;
  148. do {
  149. if (ptr == cnt) {
  150. ptr = 0;
  151. cnt = in.read(buf, 0, buf.length);
  152. if (cnt <= 0)
  153. throw new EOFException();
  154. }
  155. n++;
  156. int c = buf[ptr++] & 0xff;
  157. // Ignore CR in CRLF sequence if text
  158. if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
  159. continue;
  160. blockHashedCnt++;
  161. if (c == '\n')
  162. break;
  163. hash = (hash << 5) + hash + c;
  164. } while (n < 64 && n < remaining);
  165. hashedCnt += blockHashedCnt;
  166. add(hash, blockHashedCnt);
  167. remaining -= n;
  168. }
  169. }
  170. /**
  171. * Sort the internal table so it can be used for efficient scoring.
  172. * <p>
  173. * Once sorted, additional lines/blocks cannot be added to the index.
  174. */
  175. void sort() {
  176. // Sort the array. All of the empty space will wind up at the front,
  177. // because we forced all of the keys to always be positive. Later
  178. // we only work with the back half of the array.
  179. //
  180. Arrays.sort(idHash);
  181. }
  182. /**
  183. * Compute the similarity score between this index and another.
  184. * <p>
  185. * A region of a file is defined as a line in a text file or a fixed-size
  186. * block in a binary file. To prepare an index, each region in the file is
  187. * hashed; the values and counts of hashes are retained in a sorted table.
  188. * Define the similarity fraction F as the count of matching regions
  189. * between the two files divided between the maximum count of regions in
  190. * either file. The similarity score is F multiplied by the maxScore
  191. * constant, yielding a range [0, maxScore]. It is defined as maxScore for
  192. * the degenerate case of two empty files.
  193. * <p>
  194. * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
  195. *
  196. * @param dst
  197. * the other index
  198. * @param maxScore
  199. * the score representing a 100% match
  200. * @return the similarity score
  201. */
  202. public int score(SimilarityIndex dst, int maxScore) {
  203. long max = Math.max(hashedCnt, dst.hashedCnt);
  204. if (max == 0)
  205. return maxScore;
  206. return (int) ((common(dst) * maxScore) / max);
  207. }
  208. long common(SimilarityIndex dst) {
  209. return common(this, dst);
  210. }
  211. private static long common(SimilarityIndex src, SimilarityIndex dst) {
  212. int srcIdx = src.packedIndex(0);
  213. int dstIdx = dst.packedIndex(0);
  214. long[] srcHash = src.idHash;
  215. long[] dstHash = dst.idHash;
  216. return common(srcHash, srcIdx, dstHash, dstIdx);
  217. }
  218. private static long common(long[] srcHash, int srcIdx, //
  219. long[] dstHash, int dstIdx) {
  220. if (srcIdx == srcHash.length || dstIdx == dstHash.length)
  221. return 0;
  222. long common = 0;
  223. int srcKey = keyOf(srcHash[srcIdx]);
  224. int dstKey = keyOf(dstHash[dstIdx]);
  225. for (;;) {
  226. if (srcKey == dstKey) {
  227. common += Math.min(countOf(srcHash[srcIdx]),
  228. countOf(dstHash[dstIdx]));
  229. if (++srcIdx == srcHash.length)
  230. break;
  231. srcKey = keyOf(srcHash[srcIdx]);
  232. if (++dstIdx == dstHash.length)
  233. break;
  234. dstKey = keyOf(dstHash[dstIdx]);
  235. } else if (srcKey < dstKey) {
  236. // Regions of src which do not appear in dst.
  237. if (++srcIdx == srcHash.length)
  238. break;
  239. srcKey = keyOf(srcHash[srcIdx]);
  240. } else /* if (dstKey < srcKey) */{
  241. // Regions of dst which do not appear in src.
  242. if (++dstIdx == dstHash.length)
  243. break;
  244. dstKey = keyOf(dstHash[dstIdx]);
  245. }
  246. }
  247. return common;
  248. }
  249. // Testing only
  250. int size() {
  251. return idSize;
  252. }
  253. // Testing only
  254. int key(int idx) {
  255. return keyOf(idHash[packedIndex(idx)]);
  256. }
  257. // Testing only
  258. long count(int idx) {
  259. return countOf(idHash[packedIndex(idx)]);
  260. }
  261. // Brute force approach only for testing.
  262. int findIndex(int key) {
  263. for (int i = 0; i < idSize; i++)
  264. if (key(i) == key)
  265. return i;
  266. return -1;
  267. }
  268. private int packedIndex(int idx) {
  269. return (idHash.length - idSize) + idx;
  270. }
  271. void add(int key, int cnt) throws TableFullException {
  272. key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
  273. int j = slot(key);
  274. for (;;) {
  275. long v = idHash[j];
  276. if (v == 0) {
  277. // Empty slot in the table, store here.
  278. if (idGrowAt <= idSize) {
  279. grow();
  280. j = slot(key);
  281. continue;
  282. }
  283. idHash[j] = pair(key, cnt);
  284. idSize++;
  285. return;
  286. } else if (keyOf(v) == key) {
  287. // Same key, increment the counter. If it overflows, fail
  288. // indexing to prevent the key from being impacted.
  289. //
  290. idHash[j] = pair(key, countOf(v) + cnt);
  291. return;
  292. } else if (++j >= idHash.length) {
  293. j = 0;
  294. }
  295. }
  296. }
  297. private static long pair(int key, long cnt) throws TableFullException {
  298. if (MAX_COUNT < cnt)
  299. throw new TableFullException();
  300. return (((long) key) << KEY_SHIFT) | cnt;
  301. }
  302. private int slot(int key) {
  303. // We use 31 - idHashBits because the upper bit was already forced
  304. // to be 0 and we want the remaining high bits to be used as the
  305. // table slot.
  306. //
  307. return key >>> (31 - idHashBits);
  308. }
  309. private static int growAt(int idHashBits) {
  310. return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
  311. }
  312. @SuppressWarnings("UnusedException")
  313. private void grow() throws TableFullException {
  314. if (idHashBits == 30)
  315. throw new TableFullException();
  316. long[] oldHash = idHash;
  317. int oldSize = idHash.length;
  318. idHashBits++;
  319. idGrowAt = growAt(idHashBits);
  320. try {
  321. idHash = new long[1 << idHashBits];
  322. } catch (OutOfMemoryError noMemory) {
  323. throw TABLE_FULL_OUT_OF_MEMORY;
  324. }
  325. for (int i = 0; i < oldSize; i++) {
  326. long v = oldHash[i];
  327. if (v != 0) {
  328. int j = slot(keyOf(v));
  329. while (idHash[j] != 0)
  330. if (++j >= idHash.length)
  331. j = 0;
  332. idHash[j] = v;
  333. }
  334. }
  335. }
  336. private static int keyOf(long v) {
  337. return (int) (v >>> KEY_SHIFT);
  338. }
  339. private static long countOf(long v) {
  340. return v & MAX_COUNT;
  341. }
  342. /** Thrown by {@code create()} when file is too large. */
  343. public static class TableFullException extends Exception {
  344. private static final long serialVersionUID = 1L;
  345. }
  346. }