You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

SimilarityIndex.java 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. /*
  2. * Copyright (C) 2010, Google Inc.
  3. * and other copyright owners as documented in the project's IP log.
  4. *
  5. * This program and the accompanying materials are made available
  6. * under the terms of the Eclipse Distribution License v1.0 which
  7. * accompanies this distribution, is reproduced below, and is
  8. * available at http://www.eclipse.org/org/documents/edl-v10.php
  9. *
  10. * All rights reserved.
  11. *
  12. * Redistribution and use in source and binary forms, with or
  13. * without modification, are permitted provided that the following
  14. * conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above copyright
  17. * notice, this list of conditions and the following disclaimer.
  18. *
  19. * - Redistributions in binary form must reproduce the above
  20. * copyright notice, this list of conditions and the following
  21. * disclaimer in the documentation and/or other materials provided
  22. * with the distribution.
  23. *
  24. * - Neither the name of the Eclipse Foundation, Inc. nor the
  25. * names of its contributors may be used to endorse or promote
  26. * products derived from this software without specific prior
  27. * written permission.
  28. *
  29. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  30. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  31. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  32. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  34. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  35. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  36. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  37. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  38. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  39. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  40. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  41. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42. */
  43. package org.eclipse.jgit.diff;
  44. import java.io.EOFException;
  45. import java.io.IOException;
  46. import java.io.InputStream;
  47. import java.util.Arrays;
  48. import org.eclipse.jgit.errors.MissingObjectException;
  49. import org.eclipse.jgit.lib.ObjectLoader;
  50. import org.eclipse.jgit.lib.ObjectStream;
  51. /**
  52. * Index structure of lines/blocks in one file.
  53. * <p>
  54. * This structure can be used to compute an approximation of the similarity
  55. * between two files. The index is used by {@link SimilarityRenameDetector} to
  56. * compute scores between files.
  57. * <p>
  58. * To save space in memory, this index uses a space efficient encoding that
  59. * packs each entry into one 64 bit value. The index starts out at a small size
  60. * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
  61. * file are discovered, until a {@link TableFullException} is thrown.
  62. *
  63. * @since 4.0
  64. */
  65. public class SimilarityIndex {
  66. /** A special {@link TableFullException} used in place of OutOfMemoryError. */
  67. public static final TableFullException
  68. TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
  69. /**
  70. * Shift to apply before storing a key.
  71. * <p>
  72. * Within the 64 bit table record space, we leave the highest bit unset so
  73. * all values are positive. The lower 32 bits to count bytes.
  74. */
  75. private static final int KEY_SHIFT = 32;
  76. /** Maximum value of the count field, also mask to extract the count. */
  77. private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
  78. /**
  79. * Total amount of bytes hashed into the structure, including \n. This is
  80. * usually the size of the file minus number of CRLF encounters.
  81. */
  82. private long hashedCnt;
  83. /** Number of non-zero entries in {@link #idHash}. */
  84. private int idSize;
  85. /** {@link #idSize} that triggers {@link #idHash} to double in size. */
  86. private int idGrowAt;
  87. /**
  88. * Pairings of content keys and counters.
  89. * <p>
  90. * Slots in the table are actually two ints wedged into a single long. The
  91. * upper 32 bits stores the content key, and the remaining lower bits stores
  92. * the number of bytes associated with that key. Empty slots are denoted by
  93. * 0, which cannot occur because the count cannot be 0. Values can only be
  94. * positive, which we enforce during key addition.
  95. */
  96. private long[] idHash;
  97. /** {@code idHash.length == 1 << idHashBits}. */
  98. private int idHashBits;
  99. /**
  100. * Create a new similarity index for the given object
  101. *
  102. * @param obj
  103. * the object to hash
  104. * @return similarity index for this object
  105. * @throws IOException
  106. * file contents cannot be read from the repository.
  107. * @throws TableFullException
  108. * object hashing overflowed the storage capacity of the
  109. * SimilarityIndex.
  110. */
  111. public static SimilarityIndex create(ObjectLoader obj) throws IOException,
  112. TableFullException {
  113. SimilarityIndex idx = new SimilarityIndex();
  114. idx.hash(obj);
  115. idx.sort();
  116. return idx;
  117. }
  118. SimilarityIndex() {
  119. idHashBits = 8;
  120. idHash = new long[1 << idHashBits];
  121. idGrowAt = growAt(idHashBits);
  122. }
  123. void hash(ObjectLoader obj) throws MissingObjectException, IOException,
  124. TableFullException {
  125. if (obj.isLarge()) {
  126. hashLargeObject(obj);
  127. } else {
  128. byte[] raw = obj.getCachedBytes();
  129. hash(raw, 0, raw.length);
  130. }
  131. }
  132. private void hashLargeObject(ObjectLoader obj) throws IOException,
  133. TableFullException {
  134. ObjectStream in1 = obj.openStream();
  135. boolean text;
  136. try {
  137. text = !RawText.isBinary(in1);
  138. } finally {
  139. in1.close();
  140. }
  141. ObjectStream in2 = obj.openStream();
  142. try {
  143. hash(in2, in2.getSize(), text);
  144. } finally {
  145. in2.close();
  146. }
  147. }
  148. void hash(byte[] raw, int ptr, final int end) throws TableFullException {
  149. final boolean text = !RawText.isBinary(raw);
  150. hashedCnt = 0;
  151. while (ptr < end) {
  152. int hash = 5381;
  153. int blockHashedCnt = 0;
  154. int start = ptr;
  155. // Hash one line, or one block, whichever occurs first.
  156. do {
  157. int c = raw[ptr++] & 0xff;
  158. // Ignore CR in CRLF sequence if text
  159. if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
  160. continue;
  161. blockHashedCnt++;
  162. if (c == '\n')
  163. break;
  164. hash = (hash << 5) + hash + c;
  165. } while (ptr < end && ptr - start < 64);
  166. hashedCnt += blockHashedCnt;
  167. add(hash, blockHashedCnt);
  168. }
  169. }
  170. void hash(InputStream in, long remaining, boolean text) throws IOException,
  171. TableFullException {
  172. byte[] buf = new byte[4096];
  173. int ptr = 0;
  174. int cnt = 0;
  175. while (0 < remaining) {
  176. int hash = 5381;
  177. int blockHashedCnt = 0;
  178. // Hash one line, or one block, whichever occurs first.
  179. int n = 0;
  180. do {
  181. if (ptr == cnt) {
  182. ptr = 0;
  183. cnt = in.read(buf, 0, buf.length);
  184. if (cnt <= 0)
  185. throw new EOFException();
  186. }
  187. n++;
  188. int c = buf[ptr++] & 0xff;
  189. // Ignore CR in CRLF sequence if text
  190. if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
  191. continue;
  192. blockHashedCnt++;
  193. if (c == '\n')
  194. break;
  195. hash = (hash << 5) + hash + c;
  196. } while (n < 64 && n < remaining);
  197. hashedCnt += blockHashedCnt;
  198. add(hash, blockHashedCnt);
  199. remaining -= n;
  200. }
  201. }
  202. /**
  203. * Sort the internal table so it can be used for efficient scoring.
  204. * <p>
  205. * Once sorted, additional lines/blocks cannot be added to the index.
  206. */
  207. void sort() {
  208. // Sort the array. All of the empty space will wind up at the front,
  209. // because we forced all of the keys to always be positive. Later
  210. // we only work with the back half of the array.
  211. //
  212. Arrays.sort(idHash);
  213. }
  214. /**
  215. * Compute the similarity score between this index and another.
  216. * <p>
  217. * A region of a file is defined as a line in a text file or a fixed-size
  218. * block in a binary file. To prepare an index, each region in the file is
  219. * hashed; the values and counts of hashes are retained in a sorted table.
  220. * Define the similarity fraction F as the the count of matching regions
  221. * between the two files divided between the maximum count of regions in
  222. * either file. The similarity score is F multiplied by the maxScore
  223. * constant, yielding a range [0, maxScore]. It is defined as maxScore for
  224. * the degenerate case of two empty files.
  225. * <p>
  226. * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
  227. *
  228. * @param dst
  229. * the other index
  230. * @param maxScore
  231. * the score representing a 100% match
  232. * @return the similarity score
  233. */
  234. public int score(SimilarityIndex dst, int maxScore) {
  235. long max = Math.max(hashedCnt, dst.hashedCnt);
  236. if (max == 0)
  237. return maxScore;
  238. return (int) ((common(dst) * maxScore) / max);
  239. }
  240. long common(SimilarityIndex dst) {
  241. return common(this, dst);
  242. }
  243. private static long common(SimilarityIndex src, SimilarityIndex dst) {
  244. int srcIdx = src.packedIndex(0);
  245. int dstIdx = dst.packedIndex(0);
  246. long[] srcHash = src.idHash;
  247. long[] dstHash = dst.idHash;
  248. return common(srcHash, srcIdx, dstHash, dstIdx);
  249. }
  250. private static long common(long[] srcHash, int srcIdx, //
  251. long[] dstHash, int dstIdx) {
  252. if (srcIdx == srcHash.length || dstIdx == dstHash.length)
  253. return 0;
  254. long common = 0;
  255. int srcKey = keyOf(srcHash[srcIdx]);
  256. int dstKey = keyOf(dstHash[dstIdx]);
  257. for (;;) {
  258. if (srcKey == dstKey) {
  259. common += Math.min(countOf(srcHash[srcIdx]),
  260. countOf(dstHash[dstIdx]));
  261. if (++srcIdx == srcHash.length)
  262. break;
  263. srcKey = keyOf(srcHash[srcIdx]);
  264. if (++dstIdx == dstHash.length)
  265. break;
  266. dstKey = keyOf(dstHash[dstIdx]);
  267. } else if (srcKey < dstKey) {
  268. // Regions of src which do not appear in dst.
  269. if (++srcIdx == srcHash.length)
  270. break;
  271. srcKey = keyOf(srcHash[srcIdx]);
  272. } else /* if (dstKey < srcKey) */{
  273. // Regions of dst which do not appear in src.
  274. if (++dstIdx == dstHash.length)
  275. break;
  276. dstKey = keyOf(dstHash[dstIdx]);
  277. }
  278. }
  279. return common;
  280. }
  281. // Testing only
  282. int size() {
  283. return idSize;
  284. }
  285. // Testing only
  286. int key(int idx) {
  287. return keyOf(idHash[packedIndex(idx)]);
  288. }
  289. // Testing only
  290. long count(int idx) {
  291. return countOf(idHash[packedIndex(idx)]);
  292. }
  293. // Brute force approach only for testing.
  294. int findIndex(int key) {
  295. for (int i = 0; i < idSize; i++)
  296. if (key(i) == key)
  297. return i;
  298. return -1;
  299. }
  300. private int packedIndex(int idx) {
  301. return (idHash.length - idSize) + idx;
  302. }
  303. void add(int key, int cnt) throws TableFullException {
  304. key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
  305. int j = slot(key);
  306. for (;;) {
  307. long v = idHash[j];
  308. if (v == 0) {
  309. // Empty slot in the table, store here.
  310. if (idGrowAt <= idSize) {
  311. grow();
  312. j = slot(key);
  313. continue;
  314. }
  315. idHash[j] = pair(key, cnt);
  316. idSize++;
  317. return;
  318. } else if (keyOf(v) == key) {
  319. // Same key, increment the counter. If it overflows, fail
  320. // indexing to prevent the key from being impacted.
  321. //
  322. idHash[j] = pair(key, countOf(v) + cnt);
  323. return;
  324. } else if (++j >= idHash.length) {
  325. j = 0;
  326. }
  327. }
  328. }
  329. private static long pair(int key, long cnt) throws TableFullException {
  330. if (MAX_COUNT < cnt)
  331. throw new TableFullException();
  332. return (((long) key) << KEY_SHIFT) | cnt;
  333. }
  334. private int slot(int key) {
  335. // We use 31 - idHashBits because the upper bit was already forced
  336. // to be 0 and we want the remaining high bits to be used as the
  337. // table slot.
  338. //
  339. return key >>> (31 - idHashBits);
  340. }
  341. private static int growAt(int idHashBits) {
  342. return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
  343. }
  344. private void grow() throws TableFullException {
  345. if (idHashBits == 30)
  346. throw new TableFullException();
  347. long[] oldHash = idHash;
  348. int oldSize = idHash.length;
  349. idHashBits++;
  350. idGrowAt = growAt(idHashBits);
  351. try {
  352. idHash = new long[1 << idHashBits];
  353. } catch (OutOfMemoryError noMemory) {
  354. throw TABLE_FULL_OUT_OF_MEMORY;
  355. }
  356. for (int i = 0; i < oldSize; i++) {
  357. long v = oldHash[i];
  358. if (v != 0) {
  359. int j = slot(keyOf(v));
  360. while (idHash[j] != 0)
  361. if (++j >= idHash.length)
  362. j = 0;
  363. idHash[j] = v;
  364. }
  365. }
  366. }
  367. private static int keyOf(long v) {
  368. return (int) (v >>> KEY_SHIFT);
  369. }
  370. private static long countOf(long v) {
  371. return v & MAX_COUNT;
  372. }
  373. /** Thrown by {@code create()} when file is too large. */
  374. public static class TableFullException extends Exception {
  375. private static final long serialVersionUID = 1L;
  376. }
  377. }