You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

SimilarityIndex.java 8.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. /*
  2. * Copyright (C) 2010, Google Inc.
  3. * and other copyright owners as documented in the project's IP log.
  4. *
  5. * This program and the accompanying materials are made available
  6. * under the terms of the Eclipse Distribution License v1.0 which
  7. * accompanies this distribution, is reproduced below, and is
  8. * available at http://www.eclipse.org/org/documents/edl-v10.php
  9. *
  10. * All rights reserved.
  11. *
  12. * Redistribution and use in source and binary forms, with or
  13. * without modification, are permitted provided that the following
  14. * conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above copyright
  17. * notice, this list of conditions and the following disclaimer.
  18. *
  19. * - Redistributions in binary form must reproduce the above
  20. * copyright notice, this list of conditions and the following
  21. * disclaimer in the documentation and/or other materials provided
  22. * with the distribution.
  23. *
  24. * - Neither the name of the Eclipse Foundation, Inc. nor the
  25. * names of its contributors may be used to endorse or promote
  26. * products derived from this software without specific prior
  27. * written permission.
  28. *
  29. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  30. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  31. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  32. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  34. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  35. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  36. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  37. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  38. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  39. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  40. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  41. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42. */
  43. package org.eclipse.jgit.diff;
  44. import java.io.EOFException;
  45. import java.io.IOException;
  46. import java.io.InputStream;
  47. import java.util.Arrays;
  48. import org.eclipse.jgit.errors.MissingObjectException;
  49. import org.eclipse.jgit.lib.ObjectLoader;
  50. import org.eclipse.jgit.lib.ObjectStream;
  51. /**
  52. * Index structure of lines/blocks in one file.
  53. * <p>
  54. * This structure can be used to compute an approximation of the similarity
  55. * between two files. The index is used by {@link SimilarityRenameDetector} to
  56. * compute scores between files.
  57. * <p>
  58. * To save space in memory, this index uses a space efficient encoding which
  59. * will not exceed 1 MiB per instance. The index starts out at a smaller size
  60. * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
  61. * file are discovered.
  62. */
  63. class SimilarityIndex {
  64. /** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
  65. private static final int MAX_HASH_BITS = 17;
  66. /** The {@link #idHash} table will not grow bigger than this, ever. */
  67. private static final int MAX_HASH_SIZE = 1 << MAX_HASH_BITS;
  68. /** Prime just before {@link #MAX_HASH_SIZE}. */
  69. private static final int P = 131071;
  70. /**
  71. * Shift to apply before storing a key.
  72. * <p>
  73. * Within the 64 bit table record space, we leave the highest bit unset so
  74. * all values are positive, and we need {@link #MAX_HASH_BITS} bits for the
  75. * keys. The lower 32 bits are used to count bytes impacted.
  76. */
  77. private static final int KEY_SHIFT = 64 - 1 - MAX_HASH_BITS;
  78. /** Total size of the file we hashed into the structure. */
  79. private long fileSize;
  80. /** Number of non-zero entries in {@link #idHash}. */
  81. private int idSize;
  82. /**
  83. * Pairings of content keys and counters.
  84. * <p>
  85. * Slots in the table are actually two ints wedged into a single long. The
  86. * upper {@link #MAX_HASH_BITS} bits stores the content key, and the
  87. * remaining lower bits stores the number of bytes associated with that key.
  88. * Empty slots are denoted by 0, which cannot occur because the count cannot
  89. * be 0. Values can only be positive, which we enforce during key addition.
  90. */
  91. private long[] idHash;
  92. SimilarityIndex() {
  93. idHash = new long[256];
  94. }
  95. long getFileSize() {
  96. return fileSize;
  97. }
  98. void setFileSize(long size) {
  99. fileSize = size;
  100. }
  101. void hash(ObjectLoader obj) throws MissingObjectException, IOException {
  102. if (obj.isLarge()) {
  103. ObjectStream in = obj.openStream();
  104. try {
  105. setFileSize(in.getSize());
  106. hash(in, fileSize);
  107. } finally {
  108. in.close();
  109. }
  110. } else {
  111. byte[] raw = obj.getCachedBytes();
  112. setFileSize(raw.length);
  113. hash(raw, 0, raw.length);
  114. }
  115. }
  116. void hash(byte[] raw, int ptr, final int end) {
  117. while (ptr < end) {
  118. int hash = 5381;
  119. int start = ptr;
  120. // Hash one line, or one block, whichever occurs first.
  121. do {
  122. int c = raw[ptr++] & 0xff;
  123. if (c == '\n')
  124. break;
  125. hash = (hash << 5) ^ c;
  126. } while (ptr < end && ptr - start < 64);
  127. add(hash, ptr - start);
  128. }
  129. }
  130. void hash(InputStream in, long remaining) throws IOException {
  131. byte[] buf = new byte[4096];
  132. int ptr = 0;
  133. int cnt = 0;
  134. while (0 < remaining) {
  135. int hash = 5381;
  136. // Hash one line, or one block, whichever occurs first.
  137. int n = 0;
  138. do {
  139. if (ptr == cnt) {
  140. ptr = 0;
  141. cnt = in.read(buf, 0, buf.length);
  142. if (cnt <= 0)
  143. throw new EOFException();
  144. }
  145. n++;
  146. int c = buf[ptr++] & 0xff;
  147. if (c == '\n')
  148. break;
  149. hash = (hash << 5) ^ c;
  150. } while (n < 64 && n < remaining);
  151. add(hash, n);
  152. remaining -= n;
  153. }
  154. }
  155. /**
  156. * Sort the internal table so it can be used for efficient scoring.
  157. * <p>
  158. * Once sorted, additional lines/blocks cannot be added to the index.
  159. */
  160. void sort() {
  161. // Sort the array. All of the empty space will wind up at the front,
  162. // because we forced all of the keys to always be positive. Later
  163. // we only work with the back half of the array.
  164. //
  165. Arrays.sort(idHash);
  166. }
  167. int score(SimilarityIndex dst, int maxScore) {
  168. long max = Math.max(fileSize, dst.fileSize);
  169. if (max == 0)
  170. return maxScore;
  171. return (int) ((common(dst) * maxScore) / max);
  172. }
  173. int common(SimilarityIndex dst) {
  174. return common(this, dst);
  175. }
  176. private static int common(SimilarityIndex src, SimilarityIndex dst) {
  177. int srcIdx = src.packedIndex(0);
  178. int dstIdx = dst.packedIndex(0);
  179. long[] srcHash = src.idHash;
  180. long[] dstHash = dst.idHash;
  181. return common(srcHash, srcIdx, dstHash, dstIdx);
  182. }
  183. private static int common(long[] srcHash, int srcIdx, //
  184. long[] dstHash, int dstIdx) {
  185. if (srcIdx == srcHash.length || dstIdx == dstHash.length)
  186. return 0;
  187. int common = 0;
  188. int srcKey = keyOf(srcHash[srcIdx]);
  189. int dstKey = keyOf(dstHash[dstIdx]);
  190. for (;;) {
  191. if (srcKey == dstKey) {
  192. common += Math.min(countOf(srcHash[srcIdx]),
  193. countOf(dstHash[dstIdx]));
  194. if (++srcIdx == srcHash.length)
  195. break;
  196. srcKey = keyOf(srcHash[srcIdx]);
  197. if (++dstIdx == dstHash.length)
  198. break;
  199. dstKey = keyOf(dstHash[dstIdx]);
  200. } else if (srcKey < dstKey) {
  201. // Regions of src which do not appear in dst.
  202. if (++srcIdx == srcHash.length)
  203. break;
  204. srcKey = keyOf(srcHash[srcIdx]);
  205. } else /* if (srcKey > dstKey) */{
  206. // Regions of dst which do not appear in dst.
  207. if (++dstIdx == dstHash.length)
  208. break;
  209. dstKey = keyOf(dstHash[dstIdx]);
  210. }
  211. }
  212. return common;
  213. }
  214. // Testing only
  215. int size() {
  216. return idSize;
  217. }
  218. // Testing only
  219. int key(int idx) {
  220. return keyOf(idHash[packedIndex(idx)]);
  221. }
  222. // Testing only
  223. long count(int idx) {
  224. return countOf(idHash[packedIndex(idx)]);
  225. }
  226. // Brute force approach only for testing.
  227. int findIndex(int key) {
  228. for (int i = 0; i < idSize; i++)
  229. if (key(i) == key)
  230. return i;
  231. return -1;
  232. }
  233. private int packedIndex(int idx) {
  234. return (idHash.length - idSize) + idx;
  235. }
  236. void add(int key, int cnt) {
  237. key = hash(key);
  238. int j = slot(key);
  239. for (;;) {
  240. long v = idHash[j];
  241. if (v == 0) {
  242. // Empty slot in the table, store here.
  243. if (shouldGrow()) {
  244. grow();
  245. j = slot(key);
  246. continue;
  247. }
  248. idHash[j] = (((long) key) << KEY_SHIFT) | cnt;
  249. idSize++;
  250. return;
  251. } else if (keyOf(v) == key) {
  252. // Same key, increment the counter.
  253. idHash[j] = v + cnt;
  254. return;
  255. } else if (++j >= idHash.length) {
  256. j = 0;
  257. }
  258. }
  259. }
  260. private static int hash(int key) {
  261. // Make the key fit into our table. Since we have a maximum size
  262. // that we cap the table at, all keys get squashed before going
  263. // into the table. This prevents overflow.
  264. //
  265. return (key >>> 1) % P;
  266. }
  267. private int slot(int key) {
  268. return key % idHash.length;
  269. }
  270. private boolean shouldGrow() {
  271. int n = idHash.length;
  272. return n < MAX_HASH_SIZE && n <= idSize * 2;
  273. }
  274. private void grow() {
  275. long[] oldHash = idHash;
  276. int oldSize = idHash.length;
  277. idHash = new long[2 * oldSize];
  278. for (int i = 0; i < oldSize; i++) {
  279. long v = oldHash[i];
  280. if (v != 0) {
  281. int j = slot(keyOf(v));
  282. while (idHash[j] != 0)
  283. if (++j >= idHash.length)
  284. j = 0;
  285. idHash[j] = v;
  286. }
  287. }
  288. }
  289. private static int keyOf(long v) {
  290. return (int) (v >>> KEY_SHIFT);
  291. }
  292. private static int countOf(long v) {
  293. return (int) v;
  294. }
  295. }