You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

TextHashFunctions.java 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. /*
  2. * Copyright (C) 2010, Google Inc.
  3. * and other copyright owners as documented in the project's IP log.
  4. *
  5. * This program and the accompanying materials are made available
  6. * under the terms of the Eclipse Distribution License v1.0 which
  7. * accompanies this distribution, is reproduced below, and is
  8. * available at http://www.eclipse.org/org/documents/edl-v10.php
  9. *
  10. * All rights reserved.
  11. *
  12. * Redistribution and use in source and binary forms, with or
  13. * without modification, are permitted provided that the following
  14. * conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above copyright
  17. * notice, this list of conditions and the following disclaimer.
  18. *
  19. * - Redistributions in binary form must reproduce the above
  20. * copyright notice, this list of conditions and the following
  21. * disclaimer in the documentation and/or other materials provided
  22. * with the distribution.
  23. *
  24. * - Neither the name of the Eclipse Foundation, Inc. nor the
  25. * names of its contributors may be used to endorse or promote
  26. * products derived from this software without specific prior
  27. * written permission.
  28. *
  29. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  30. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  31. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  32. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  34. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  35. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  36. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  37. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  38. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  39. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  40. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  41. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42. */
  43. package org.eclipse.jgit.pgm.debug;
  44. import static java.lang.Integer.valueOf;
  45. import static java.lang.Long.valueOf;
  46. import java.io.File;
  47. import java.lang.reflect.Field;
  48. import java.security.MessageDigest;
  49. import java.util.ArrayList;
  50. import java.util.Arrays;
  51. import java.util.HashSet;
  52. import java.util.List;
  53. import org.eclipse.jgit.diff.RawText;
  54. import org.eclipse.jgit.diff.RawTextComparator;
  55. import org.eclipse.jgit.errors.LargeObjectException;
  56. import org.eclipse.jgit.lib.Constants;
  57. import org.eclipse.jgit.lib.FileMode;
  58. import org.eclipse.jgit.lib.MutableObjectId;
  59. import org.eclipse.jgit.lib.ObjectReader;
  60. import org.eclipse.jgit.lib.Repository;
  61. import org.eclipse.jgit.lib.RepositoryBuilder;
  62. import org.eclipse.jgit.lib.RepositoryCache;
  63. import org.eclipse.jgit.pgm.Command;
  64. import org.eclipse.jgit.pgm.TextBuiltin;
  65. import org.eclipse.jgit.pgm.internal.CLIText;
  66. import org.eclipse.jgit.revwalk.RevWalk;
  67. import org.eclipse.jgit.treewalk.TreeWalk;
  68. import org.eclipse.jgit.util.FS;
  69. import org.eclipse.jgit.util.NB;
  70. import org.kohsuke.args4j.Option;
  71. /**
  72. * Scan repository to compute maximum number of collisions for hash functions.
  73. *
  74. * This is a test suite to help benchmark the collision rate of hash functions
  75. * when applied to file contents in a Git repository. The test scans all text
  76. * files in the HEAD revision of the repository it is run within. For each file
  77. * it finds the unique lines, and then inserts those lines into a hash table to
  78. * determine collision rates under the selected hash functions.
  79. *
  80. * To add another hash function to the test suite, declare a new instance member
  81. * field of type {@link Hash} and implement the hashRegion method. The test
  82. * suite will automatically pick up the new function through reflection.
  83. *
  84. * To add another folding function (method of squashing a 32 bit hash code into
  85. * the hash table's smaller array index space), declare a new instance field of
  86. * type {@link Fold} and implement the logic. The test suite will automatically
  87. * pick up the new function through reflection.
  88. */
  89. @Command(usage = "usage_TextHashFunctions")
  90. class TextHashFunctions extends TextBuiltin {
  91. /** Standard SHA-1 on the line, using the first 4 bytes as the hash code. */
  92. final Hash sha1 = new Hash() {
  93. private final MessageDigest md = Constants.newMessageDigest();
  94. @Override
  95. protected int hashRegion(byte[] raw, int ptr, int end) {
  96. md.reset();
  97. md.update(raw, ptr, end - ptr);
  98. return NB.decodeInt32(md.digest(), 0);
  99. }
  100. };
  101. /** Professor Daniel J. Bernstein's rather popular string hash. */
  102. final Hash djb = new Hash() {
  103. @Override
  104. protected int hashRegion(byte[] raw, int ptr, int end) {
  105. int hash = 5381;
  106. for (; ptr < end; ptr++)
  107. hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
  108. return hash;
  109. }
  110. };
  111. /** Hash function commonly used by java.lang.String. */
  112. final Hash string_hash31 = new Hash() {
  113. @Override
  114. protected int hashRegion(byte[] raw, int ptr, int end) {
  115. int hash = 0;
  116. for (; ptr < end; ptr++)
  117. hash = 31 * hash + (raw[ptr] & 0xff);
  118. return hash;
  119. }
  120. };
  121. /** The Rabin polynomial hash that is used by our own DeltaIndex. */
  122. final Hash rabin_DeltaIndex = new Hash() {
  123. private final byte[] buf16 = new byte[16];
  124. @Override
  125. protected int hashRegion(byte[] raw, int ptr, int end) {
  126. if (end - ptr < 16) {
  127. Arrays.fill(buf16, (byte) 0);
  128. System.arraycopy(raw, ptr, buf16, 0, end - ptr);
  129. return rabin(buf16, 0);
  130. } else {
  131. return rabin(raw, ptr);
  132. }
  133. }
  134. private int rabin(byte[] raw, int ptr) {
  135. int hash;
  136. // The first 4 steps collapse out into a 4 byte big-endian decode,
  137. // with a larger right shift as we combined shift lefts together.
  138. //
  139. hash = ((raw[ptr] & 0xff) << 24) //
  140. | ((raw[ptr + 1] & 0xff) << 16) //
  141. | ((raw[ptr + 2] & 0xff) << 8) //
  142. | (raw[ptr + 3] & 0xff);
  143. hash ^= T[hash >>> 31];
  144. hash = ((hash << 8) | (raw[ptr + 4] & 0xff)) ^ T[hash >>> 23];
  145. hash = ((hash << 8) | (raw[ptr + 5] & 0xff)) ^ T[hash >>> 23];
  146. hash = ((hash << 8) | (raw[ptr + 6] & 0xff)) ^ T[hash >>> 23];
  147. hash = ((hash << 8) | (raw[ptr + 7] & 0xff)) ^ T[hash >>> 23];
  148. hash = ((hash << 8) | (raw[ptr + 8] & 0xff)) ^ T[hash >>> 23];
  149. hash = ((hash << 8) | (raw[ptr + 9] & 0xff)) ^ T[hash >>> 23];
  150. hash = ((hash << 8) | (raw[ptr + 10] & 0xff)) ^ T[hash >>> 23];
  151. hash = ((hash << 8) | (raw[ptr + 11] & 0xff)) ^ T[hash >>> 23];
  152. hash = ((hash << 8) | (raw[ptr + 12] & 0xff)) ^ T[hash >>> 23];
  153. hash = ((hash << 8) | (raw[ptr + 13] & 0xff)) ^ T[hash >>> 23];
  154. hash = ((hash << 8) | (raw[ptr + 14] & 0xff)) ^ T[hash >>> 23];
  155. hash = ((hash << 8) | (raw[ptr + 15] & 0xff)) ^ T[hash >>> 23];
  156. return hash;
  157. }
  158. private final int[] T = { 0x00000000, 0xd4c6b32d, 0x7d4bd577,
  159. 0xa98d665a, 0x2e5119c3, 0xfa97aaee, 0x531accb4, 0x87dc7f99,
  160. 0x5ca23386, 0x886480ab, 0x21e9e6f1, 0xf52f55dc, 0x72f32a45,
  161. 0xa6359968, 0x0fb8ff32, 0xdb7e4c1f, 0x6d82d421, 0xb944670c,
  162. 0x10c90156, 0xc40fb27b, 0x43d3cde2, 0x97157ecf, 0x3e981895,
  163. 0xea5eabb8, 0x3120e7a7, 0xe5e6548a, 0x4c6b32d0, 0x98ad81fd,
  164. 0x1f71fe64, 0xcbb74d49, 0x623a2b13, 0xb6fc983e, 0x0fc31b6f,
  165. 0xdb05a842, 0x7288ce18, 0xa64e7d35, 0x219202ac, 0xf554b181,
  166. 0x5cd9d7db, 0x881f64f6, 0x536128e9, 0x87a79bc4, 0x2e2afd9e,
  167. 0xfaec4eb3, 0x7d30312a, 0xa9f68207, 0x007be45d, 0xd4bd5770,
  168. 0x6241cf4e, 0xb6877c63, 0x1f0a1a39, 0xcbcca914, 0x4c10d68d,
  169. 0x98d665a0, 0x315b03fa, 0xe59db0d7, 0x3ee3fcc8, 0xea254fe5,
  170. 0x43a829bf, 0x976e9a92, 0x10b2e50b, 0xc4745626, 0x6df9307c,
  171. 0xb93f8351, 0x1f8636de, 0xcb4085f3, 0x62cde3a9, 0xb60b5084,
  172. 0x31d72f1d, 0xe5119c30, 0x4c9cfa6a, 0x985a4947, 0x43240558,
  173. 0x97e2b675, 0x3e6fd02f, 0xeaa96302, 0x6d751c9b, 0xb9b3afb6,
  174. 0x103ec9ec, 0xc4f87ac1, 0x7204e2ff, 0xa6c251d2, 0x0f4f3788,
  175. 0xdb8984a5, 0x5c55fb3c, 0x88934811, 0x211e2e4b, 0xf5d89d66,
  176. 0x2ea6d179, 0xfa606254, 0x53ed040e, 0x872bb723, 0x00f7c8ba,
  177. 0xd4317b97, 0x7dbc1dcd, 0xa97aaee0, 0x10452db1, 0xc4839e9c,
  178. 0x6d0ef8c6, 0xb9c84beb, 0x3e143472, 0xead2875f, 0x435fe105,
  179. 0x97995228, 0x4ce71e37, 0x9821ad1a, 0x31accb40, 0xe56a786d,
  180. 0x62b607f4, 0xb670b4d9, 0x1ffdd283, 0xcb3b61ae, 0x7dc7f990,
  181. 0xa9014abd, 0x008c2ce7, 0xd44a9fca, 0x5396e053, 0x8750537e,
  182. 0x2edd3524, 0xfa1b8609, 0x2165ca16, 0xf5a3793b, 0x5c2e1f61,
  183. 0x88e8ac4c, 0x0f34d3d5, 0xdbf260f8, 0x727f06a2, 0xa6b9b58f,
  184. 0x3f0c6dbc, 0xebcade91, 0x4247b8cb, 0x96810be6, 0x115d747f,
  185. 0xc59bc752, 0x6c16a108, 0xb8d01225, 0x63ae5e3a, 0xb768ed17,
  186. 0x1ee58b4d, 0xca233860, 0x4dff47f9, 0x9939f4d4, 0x30b4928e,
  187. 0xe47221a3, 0x528eb99d, 0x86480ab0, 0x2fc56cea, 0xfb03dfc7,
  188. 0x7cdfa05e, 0xa8191373, 0x01947529, 0xd552c604, 0x0e2c8a1b,
  189. 0xdaea3936, 0x73675f6c, 0xa7a1ec41, 0x207d93d8, 0xf4bb20f5,
  190. 0x5d3646af, 0x89f0f582, 0x30cf76d3, 0xe409c5fe, 0x4d84a3a4,
  191. 0x99421089, 0x1e9e6f10, 0xca58dc3d, 0x63d5ba67, 0xb713094a,
  192. 0x6c6d4555, 0xb8abf678, 0x11269022, 0xc5e0230f, 0x423c5c96,
  193. 0x96faefbb, 0x3f7789e1, 0xebb13acc, 0x5d4da2f2, 0x898b11df,
  194. 0x20067785, 0xf4c0c4a8, 0x731cbb31, 0xa7da081c, 0x0e576e46,
  195. 0xda91dd6b, 0x01ef9174, 0xd5292259, 0x7ca44403, 0xa862f72e,
  196. 0x2fbe88b7, 0xfb783b9a, 0x52f55dc0, 0x8633eeed, 0x208a5b62,
  197. 0xf44ce84f, 0x5dc18e15, 0x89073d38, 0x0edb42a1, 0xda1df18c,
  198. 0x739097d6, 0xa75624fb, 0x7c2868e4, 0xa8eedbc9, 0x0163bd93,
  199. 0xd5a50ebe, 0x52797127, 0x86bfc20a, 0x2f32a450, 0xfbf4177d,
  200. 0x4d088f43, 0x99ce3c6e, 0x30435a34, 0xe485e919, 0x63599680,
  201. 0xb79f25ad, 0x1e1243f7, 0xcad4f0da, 0x11aabcc5, 0xc56c0fe8,
  202. 0x6ce169b2, 0xb827da9f, 0x3ffba506, 0xeb3d162b, 0x42b07071,
  203. 0x9676c35c, 0x2f49400d, 0xfb8ff320, 0x5202957a, 0x86c42657,
  204. 0x011859ce, 0xd5deeae3, 0x7c538cb9, 0xa8953f94, 0x73eb738b,
  205. 0xa72dc0a6, 0x0ea0a6fc, 0xda6615d1, 0x5dba6a48, 0x897cd965,
  206. 0x20f1bf3f, 0xf4370c12, 0x42cb942c, 0x960d2701, 0x3f80415b,
  207. 0xeb46f276, 0x6c9a8def, 0xb85c3ec2, 0x11d15898, 0xc517ebb5,
  208. 0x1e69a7aa, 0xcaaf1487, 0x632272dd, 0xb7e4c1f0, 0x3038be69,
  209. 0xe4fe0d44, 0x4d736b1e, 0x99b5d833 };
  210. };
  211. /** Bitwise-and to extract only the low bits. */
  212. final Fold truncate = new Fold() {
  213. @Override
  214. public int fold(int hash, int bits) {
  215. return hash & ((1 << bits) - 1);
  216. }
  217. };
  218. /** Applies the golden ratio and takes the upper bits. */
  219. final Fold golden_ratio = new Fold() {
  220. @Override
  221. public int fold(int hash, int bits) {
  222. /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
  223. return (hash * 0x9e370001) >>> (32 - bits);
  224. }
  225. };
  226. // -----------------------------------------------------------------------
  227. //
  228. // Implementation of the suite lives below this line.
  229. //
  230. //
  231. @Option(name = "--hash", multiValued = true, metaVar = "NAME", usage = "Enable hash function(s)")
  232. List<String> hashFunctions = new ArrayList<>();
  233. @Option(name = "--fold", multiValued = true, metaVar = "NAME", usage = "Enable fold function(s)")
  234. List<String> foldFunctions = new ArrayList<>();
  235. @Option(name = "--text-limit", metaVar = "LIMIT", usage = "Maximum size in KiB to scan")
  236. int textLimit = 15 * 1024; // 15 MiB as later we do * 1024.
  237. @Option(name = "--repository", aliases = { "-r" }, multiValued = true, metaVar = "GIT_DIR", usage = "Repository to scan")
  238. List<File> gitDirs = new ArrayList<>();
  239. @Override
  240. protected boolean requiresRepository() {
  241. return false;
  242. }
  243. @Override
  244. protected void run() throws Exception {
  245. if (gitDirs.isEmpty()) {
  246. RepositoryBuilder rb = new RepositoryBuilder() //
  247. .setGitDir(new File(gitdir)) //
  248. .readEnvironment() //
  249. .findGitDir();
  250. if (rb.getGitDir() == null)
  251. throw die(CLIText.get().cantFindGitDirectory);
  252. gitDirs.add(rb.getGitDir());
  253. }
  254. for (File dir : gitDirs) {
  255. RepositoryBuilder rb = new RepositoryBuilder();
  256. if (RepositoryCache.FileKey.isGitRepository(dir, FS.DETECTED))
  257. rb.setGitDir(dir);
  258. else
  259. rb.findGitDir(dir);
  260. Repository repo = rb.build();
  261. try {
  262. run(repo);
  263. } finally {
  264. repo.close();
  265. }
  266. }
  267. }
  268. private void run(Repository repo) throws Exception {
  269. List<Function> all = init();
  270. long fileCnt = 0;
  271. long lineCnt = 0;
  272. try (ObjectReader or = repo.newObjectReader();
  273. RevWalk rw = new RevWalk(or);
  274. TreeWalk tw = new TreeWalk(or)) {
  275. final MutableObjectId id = new MutableObjectId();
  276. tw.reset(rw.parseTree(repo.resolve(Constants.HEAD)));
  277. tw.setRecursive(true);
  278. while (tw.next()) {
  279. FileMode fm = tw.getFileMode(0);
  280. if (!FileMode.REGULAR_FILE.equals(fm)
  281. && !FileMode.EXECUTABLE_FILE.equals(fm))
  282. continue;
  283. byte[] raw;
  284. try {
  285. tw.getObjectId(id, 0);
  286. raw = or.open(id).getCachedBytes(textLimit * 1024);
  287. } catch (LargeObjectException tooBig) {
  288. continue;
  289. }
  290. if (RawText.isBinary(raw))
  291. continue;
  292. RawText txt = new RawText(raw);
  293. int[] lines = new int[txt.size()];
  294. int cnt = 0;
  295. HashSet<Line> u = new HashSet<>();
  296. for (int i = 0; i < txt.size(); i++) {
  297. if (u.add(new Line(txt, i)))
  298. lines[cnt++] = i;
  299. }
  300. fileCnt++;
  301. lineCnt += cnt;
  302. for (Function fun : all)
  303. testOne(fun, txt, lines, cnt);
  304. }
  305. }
  306. File directory = repo.getDirectory();
  307. if (directory != null) {
  308. String name = directory.getName();
  309. File parent = directory.getParentFile();
  310. if (name.equals(Constants.DOT_GIT) && parent != null)
  311. name = parent.getName();
  312. outw.println(name + ":"); //$NON-NLS-1$
  313. }
  314. outw.format(" %6d files; %5d avg. unique lines/file\n", //$NON-NLS-1$
  315. valueOf(fileCnt), //
  316. valueOf(lineCnt / fileCnt));
  317. outw.format("%-20s %-15s %9s\n", "Hash", "Fold", "Max Len"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
  318. outw.println("-----------------------------------------------"); //$NON-NLS-1$
  319. String lastHashName = null;
  320. for (Function fun : all) {
  321. String hashName = fun.hash.name;
  322. if (hashName.equals(lastHashName))
  323. hashName = ""; //$NON-NLS-1$
  324. outw.format("%-20s %-15s %9d\n", // //$NON-NLS-1$
  325. hashName, //
  326. fun.fold.name, //
  327. valueOf(fun.maxChainLength));
  328. lastHashName = fun.hash.name;
  329. }
  330. outw.println();
  331. outw.flush();
  332. }
  333. private static void testOne(Function fun, RawText txt, int[] elements,
  334. int cnt) {
  335. final Hash cmp = fun.hash;
  336. final Fold fold = fun.fold;
  337. final int bits = tableBits(cnt);
  338. final int[] buckets = new int[1 << bits];
  339. for (int i = 0; i < cnt; i++)
  340. buckets[fold.fold(cmp.hash(txt, elements[i]), bits)]++;
  341. int maxChainLength = 0;
  342. for (int i = 0; i < buckets.length; i++)
  343. maxChainLength = Math.max(maxChainLength, buckets[i]);
  344. fun.maxChainLength = Math.max(fun.maxChainLength, maxChainLength);
  345. }
  346. private List<Function> init() {
  347. List<Hash> hashes = new ArrayList<>();
  348. List<Fold> folds = new ArrayList<>();
  349. try {
  350. for (Field f : TextHashFunctions.class.getDeclaredFields()) {
  351. if (f.getType() == Hash.class) {
  352. f.setAccessible(true);
  353. Hash cmp = (Hash) f.get(this);
  354. cmp.name = f.getName();
  355. hashes.add(cmp);
  356. } else if (f.getType() == Fold.class) {
  357. f.setAccessible(true);
  358. Fold fold = (Fold) f.get(this);
  359. fold.name = f.getName();
  360. folds.add(fold);
  361. }
  362. }
  363. } catch (IllegalArgumentException e) {
  364. throw new RuntimeException("Cannot determine names", e); //$NON-NLS-1$
  365. } catch (IllegalAccessException e) {
  366. throw new RuntimeException("Cannot determine names", e); //$NON-NLS-1$
  367. }
  368. List<Function> all = new ArrayList<>();
  369. for (Hash cmp : hashes) {
  370. if (include(cmp.name, hashFunctions)) {
  371. for (Fold f : folds) {
  372. if (include(f.name, foldFunctions)) {
  373. all.add(new Function(cmp, f));
  374. }
  375. }
  376. }
  377. }
  378. return all;
  379. }
  380. private static boolean include(String name, List<String> want) {
  381. if (want.isEmpty())
  382. return true;
  383. for (String s : want) {
  384. if (s.equalsIgnoreCase(name))
  385. return true;
  386. }
  387. return false;
  388. }
  389. private static class Function {
  390. final Hash hash;
  391. final Fold fold;
  392. int maxChainLength;
  393. Function(Hash cmp, Fold fold) {
  394. this.hash = cmp;
  395. this.fold = fold;
  396. }
  397. }
  398. /** Base class for any hashCode function to be tested. */
  399. private static abstract class Hash extends RawTextComparator {
  400. String name;
  401. @Override
  402. public boolean equals(RawText a, int ai, RawText b, int bi) {
  403. return RawTextComparator.DEFAULT.equals(a, ai, b, bi);
  404. }
  405. }
  406. /** Base class for any hashCode folding function to be tested. */
  407. private static abstract class Fold {
  408. String name;
  409. /**
  410. * Fold the given 32-bit hash code into only {@code bits} of space.
  411. *
  412. * @param hash
  413. * the 32 bit hash code to be folded into a smaller value.
  414. * @param bits
  415. * total number of bits that can appear in the output. The
  416. * output value must be in the range {@code [0, 1 << bits)}.
  417. * When bits = 2, valid outputs are 0, 1, 2, 3.
  418. * @return the folded hash, squeezed into only {@code bits}.
  419. */
  420. abstract int fold(int hash, int bits);
  421. }
  422. /** Utility to help us identify unique lines in a file. */
  423. private class Line {
  424. private final RawText txt;
  425. private final int pos;
  426. Line(RawText txt, int pos) {
  427. this.txt = txt;
  428. this.pos = pos;
  429. }
  430. @Override
  431. public int hashCode() {
  432. return RawTextComparator.DEFAULT.hash(txt, pos);
  433. }
  434. @Override
  435. public boolean equals(Object obj) {
  436. if (obj instanceof Line) {
  437. Line e = (Line) obj;
  438. return RawTextComparator.DEFAULT.equals(txt, pos, e.txt, e.pos);
  439. }
  440. return false;
  441. }
  442. }
  443. private static int tableBits(final int sz) {
  444. int bits = 31 - Integer.numberOfLeadingZeros(sz);
  445. if (bits == 0)
  446. bits = 1;
  447. if (1 << bits < sz)
  448. bits++;
  449. return bits;
  450. }
  451. }