Merge changes Ife0cc2da,If38507ef

* changes: Speed up PathFilterGroup for large sets of paths Add test case for PathFilterGroup
author: Robin Rosenberg <robin.rosenberg@dewire.com> 2013-01-28 19:00:00 -0500
committer: Gerrit Code Review @ Eclipse.org <gerrit@eclipse.org> 2013-01-28 19:00:00 -0500
commit: ee413067fded57d817af5b6b9978b8ceb6e6374e (patch)
tree: 76d00910ec5b361306fe224e77503f9e818a9424 /org.eclipse.jgit/src/org/eclipse/jgit/treewalk
parent: 33bc4f7c052fc7a1150f0c9671736329d8100e29 (diff)
parent: 522fc6a9c64bb28399f7ecadb239aa7a02c5a81b (diff)
download: jgit-ee413067fded57d817af5b6b9978b8ceb6e6374e.tar.gz
jgit-ee413067fded57d817af5b6b9978b8ceb6e6374e.zip
2 files changed, 387 insertions, 30 deletions
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/treewalk/filter/ByteArraySet.java b/org.eclipse.jgit/src/org/eclipse/jgit/treewalk/filter/ByteArraySet.java
new file mode 100644
index 0000000000..0df24af24f
--- /dev/null
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/treewalk/filter/ByteArraySet.java
@@ -0,0 +1,318 @@
+/*
+ * Copyright (C) 2009, Google Inc.
+ * Copyright (C) 2008, Marek Zawirski <marek.zawirski@gmail.com>
+ * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
+ * Copyright (C) 2013, Robin Rosenberg
+ * and other copyright owners as documented in the project's IP log.
+ *
+ * This program and the accompanying materials are made available
+ * under the terms of the Eclipse Distribution License v1.0 which
+ * accompanies this distribution, is reproduced below, and is
+ * available at http://www.eclipse.org/org/documents/edl-v10.php
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials provided
+ *   with the distribution.
+ *
+ * - Neither the name of the Eclipse Foundation, Inc. nor the
+ *   names of its contributors may be used to endorse or promote
+ *   products derived from this software without specific prior
+ *   written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.eclipse.jgit.treewalk.filter;
+
+import org.eclipse.jgit.util.RawParseUtils;
+
+/**
+ * Specialized set for byte arrays, interpreted as strings for use in
+ * {@link PathFilterGroup.Group}. Most methods assume the hash is already known
+ * and therefore require the caller to supply it beforehand. The implementation
+ * is a loose derivative of ObjectIdSubclassMap.
+ */
+class ByteArraySet {
+
+	private int size;
+
+	private int grow;
+
+	private int mask;
+
+	private byte[][] table;
+
+	/**
+	 * Create an empty set whose table size is the smallest power of two that
+	 * is at least twice {@code capacity} (and never less than 2).
+	 * @param capacity expected number of entries
+	 */
+	ByteArraySet(int capacity) {
+		initTable(Integer.highestOneBit((capacity * 2) - 1) << 1);
+	}
+
+	private byte[] get(final byte[] toFind, int length, int hash) {
+		final int msk = mask;
+		int i = hash & msk;
+		final byte[][] tbl = table;
+		byte[] obj;
+
+		while ((obj = tbl[i]) != null) {
+			if (equals(obj, toFind, length))
+				return obj;
+			i = (i + 1) & msk;
+		}
+		return null;
+	}
+
+	private static boolean equals(byte[] a, byte[] b, int length) {
+		// a is a stored entry: it must be exactly length bytes, not a longer path
+		if (a.length != length || b.length < length)
+			return false;
+		for (int i = 0; i < length; ++i)
+			if (a[i] != b[i])
+				return false;
+		return true;
+	}
+
+	/**
+	 * Returns true if this set contains the specified array.
+	 *
+	 * @param toFind
+	 *            array to find.
+	 * @param length
+	 *            The number of bytes in toFind that are used
+	 * @param hash
+	 *            pre-computed hash of toFind
+	 * @return true if the mapping exists for this byte array; false otherwise.
+	 */
+	boolean contains(final byte[] toFind, int length, int hash) {
+		return get(toFind, length, hash) != null;
+	}
+
+	/**
+	 * Store a byte array for future lookup.
+	 * <p>
+	 * Stores {@code newValue}, but only if it does not already exist in the
+	 * set. Callers can tell if the value is new by checking the return value
+	 * with reference equality:
+	 *
+	 * <pre>
+	 * byte[] array = ...;
+	 * boolean wasNew = set.addIfAbsent(array, length, hash) == array;
+	 * </pre>
+	 *
+	 * @param newValue
+	 *            the array to store.
+	 * @param length
+	 *            The number of bytes in newValue that are used
+	 * @param hash
+	 *            pre-computed hash of newValue
+	 * @return the array now stored ({@code newValue}, or a copy trimmed to
+	 *         {@code length} bytes) if added, or the prior value already stored
+	 *         that {@code get(newValue)} would have returned.
+	 */
+	byte[] addIfAbsent(final byte[] newValue, int length, int hash) {
+		final int msk = mask;
+		int i = hash & msk;
+		final byte[][] tbl = table;
+		byte[] obj;
+
+		while ((obj = tbl[i]) != null) {
+			if (equals(obj, newValue, length))
+				return obj;
+			i = (i + 1) & msk;
+		}
+
+		byte[] valueToInsert = copyIfNotSameSize(newValue, length);
+		if (++size == grow) {
+			grow();
+			insert(valueToInsert, hash);
+		} else
+			tbl[i] = valueToInsert;
+		return valueToInsert;
+	}
+
+	private static byte[] copyIfNotSameSize(byte[] newValue, int length) {
+		if (newValue.length == length)
+			return newValue;
+		byte[] ret = new byte[length];
+		System.arraycopy(newValue, 0, ret, 0, length);
+		return ret;
+	}
+
+	/**
+	 * @return number of arrays in the set
+	 */
+	int size() {
+		return size;
+	}
+
+	/** @return true if {@link #size()} is 0. */
+	boolean isEmpty() {
+		return size == 0;
+	}
+
+	private void insert(final byte[] newValue, int hash) {
+		final int msk = mask;
+		int j = hash & msk;
+		final byte[][] tbl = table;
+		while (tbl[j] != null)
+			j = (j + 1) & msk;
+		tbl[j] = newValue;
+	}
+
+	private Hasher hasher = new Hasher(null, 0);
+
+	private void grow() {
+		final byte[][] oldTable = table;
+		final int oldSize = table.length;
+
+		initTable(oldSize << 1);
+		for (int i = 0; i < oldSize; i++) {
+			final byte[] obj = oldTable[i];
+			if (obj != null) {
+				hasher.init(obj, obj.length);
+				insert(obj, hasher.hash());
+			}
+		}
+	}
+
+	private void initTable(int sz) {
+		if (sz < 2)
+			sz = 2;
+		grow = sz >> 1;
+		mask = sz - 1;
+		table = new byte[sz][];
+	}
+
+	@Override
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append('[');
+		for (byte[] b : table) {
+			if (b == null)
+				continue;
+			if (sb.length() > 1)
+				sb.append(" , "); //$NON-NLS-1$
+			sb.append('"');
+			sb.append(RawParseUtils.decode(b));
+			sb.append('"');
+			sb.append('(');
+			sb.append(chainlength(b));
+			sb.append(')');
+		}
+		sb.append(']');
+		return sb.toString();
+	}
+
+	private int chainlength(byte[] b) {
+		Hasher h = new Hasher(b, b.length);
+		int hash = h.hash();
+		final int msk = mask;
+		int i = hash & msk;
+		final byte[][] tbl = table;
+		byte[] obj;
+
+		int n = 0;
+		while ((obj = tbl[i]) != null) {
+			if (equals(obj, b, b.length))
+				return n;
+			i = (i + 1) & msk;
+			++n;
+		}
+		return -1;
+	}
+
+	static class Hasher {
+		private int hash;
+
+		private int pos;
+
+		private byte[] data;
+
+		private int length;
+
+		Hasher(byte[] data, int length) {
+			init(data, length);
+		}
+
+		void init(byte[] d, int l) {
+			this.data = d;
+			this.length = l;
+			pos = 0;
+			hash = 0;
+		}
+
+		int hash() {
+			while (pos < length)
+				hash = hash * 31 + data[pos++];
+			return hash;
+		}
+
+		int nextHash() {
+			for (;;) {
+				hash = hash * 31 + data[pos];
+				++pos;
+				if (pos == length || data[pos] == '/')
+					return hash;
+			}
+		}
+
+		int getHash() {
+			return hash;
+		}
+
+		boolean hasNext() {
+			return pos < length;
+		}
+
+		public int length() {
+			return pos;
+		}
+
+		@Override
+		public String toString() {
+			StringBuilder sb = new StringBuilder();
+			for (int i = 0; i < pos; ++i)
+				sb.append((char) data[i]);
+			sb.append(" | "); //$NON-NLS-1$
+			for (int i = pos; i < length; ++i)
+				sb.append((char) data[i]);
+			return sb.toString();
+		}
+	}
+
+	byte[][] toArray() {
+		byte[][] ret = new byte[size][];
+		int i = 0;
+		for (byte[] entry : table) {
+			if (entry != null)
+				ret[i++] = entry;
+		}
+		return ret;
+	}
+
+}
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/treewalk/filter/PathFilterGroup.java b/org.eclipse.jgit/src/org/eclipse/jgit/treewalk/filter/PathFilterGroup.java
index 51761a8126..66d9f87a77 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/treewalk/filter/PathFilterGroup.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/treewalk/filter/PathFilterGroup.java
@@ -44,13 +44,13 @@
 
 package org.eclipse.jgit.treewalk.filter;
 
-import java.util.Arrays;
 import java.util.Collection;
-import java.util.Comparator;
 
 import org.eclipse.jgit.errors.StopWalkException;
 import org.eclipse.jgit.internal.JGitText;
 import org.eclipse.jgit.treewalk.TreeWalk;
+import org.eclipse.jgit.treewalk.filter.ByteArraySet.Hasher;
+import org.eclipse.jgit.util.RawParseUtils;
 
 /**
  * Includes tree entries only if they match one or more configured paths.
@@ -83,7 +83,8 @@ public class PathFilterGroup {
 	 */
 	public static TreeFilter createFromStrings(final Collection<String> paths) {
 		if (paths.isEmpty())
-			throw new IllegalArgumentException(JGitText.get().atLeastOnePathIsRequired);
+			throw new IllegalArgumentException(
+					JGitText.get().atLeastOnePathIsRequired);
 		final PathFilter[] p = new PathFilter[paths.size()];
 		int i = 0;
 		for (final String s : paths)
@@ -131,7 +132,8 @@ public class PathFilterGroup {
 	 */
 	public static TreeFilter create(final Collection<PathFilter> paths) {
 		if (paths.isEmpty())
-			throw new IllegalArgumentException(JGitText.get().atLeastOnePathIsRequired);
+			throw new IllegalArgumentException(
+					JGitText.get().atLeastOnePathIsRequired);
 		final PathFilter[] p = new PathFilter[paths.size()];
 		paths.toArray(p);
 		return create(p);
@@ -177,41 +179,74 @@ public class PathFilterGroup {
 	}
 
 	static class Group extends TreeFilter {
-		private static final Comparator<PathFilter> PATH_SORT = new Comparator<PathFilter>() {
-			public int compare(final PathFilter o1, final PathFilter o2) {
-				return o1.pathStr.compareTo(o2.pathStr);
-			}
-		};
 
-		private final PathFilter[] paths;
+		private ByteArraySet fullpaths;
+
+		private ByteArraySet prefixes;
+
+		private byte[] max;
+
+		private Group(final PathFilter[] pathFilters) {
+			fullpaths = new ByteArraySet(pathFilters.length);
+			prefixes = new ByteArraySet(pathFilters.length / 5);
+			// 5 is an empirically derived ratio of #paths/#prefixes from:
+			// egit/jgit: 8
+			// git: 5
+			// linux kernel: 13
+			// eclipse.platform.ui: 7
+			max = pathFilters[0].pathRaw;
+			Hasher hasher = new Hasher(null, 0);
+			for (PathFilter pf : pathFilters) {
+				hasher.init(pf.pathRaw, pf.pathRaw.length);
+				while (hasher.hasNext()) {
+					int hash = hasher.nextHash();
+					if (hasher.hasNext())
+						prefixes.addIfAbsent(pf.pathRaw, hasher.length(), hash);
+				}
+				fullpaths.addIfAbsent(pf.pathRaw, pf.pathRaw.length,
+						hasher.getHash());
+				if (compare(max, pf.pathRaw) < 0)
+					max = pf.pathRaw;
+			}
+		}
 
-		private Group(final PathFilter[] p) {
-			paths = p;
-			Arrays.sort(paths, PATH_SORT);
+		private static int compare(byte[] a, byte[] b) {
+			int i = 0;
+			while (i < a.length && i < b.length) {
+				int ba = a[i] & 0xFF;
+				int bb = b[i] & 0xFF;
+				int cmp = ba - bb;
+				if (cmp != 0)
+					return cmp;
+				++i;
+			}
+			return a.length - b.length;
 		}
 
 		@Override
 		public boolean include(final TreeWalk walker) {
-			final int n = paths.length;
-			for (int i = 0;;) {
-				final byte[] r = paths[i].pathRaw;
-				final int cmp = walker.isPathPrefix(r, r.length);
-				if (cmp == 0)
+
+			byte[] rp = walker.getRawPath();
+			Hasher hasher = new Hasher(rp, walker.getPathLength());
+			while (hasher.hasNext()) {
+				int hash = hasher.nextHash();
+				if (fullpaths.contains(rp, hasher.length(), hash))
 					return true;
-				if (++i < n)
-					continue;
-				if (cmp > 0)
-					throw StopWalkException.INSTANCE;
-				return false;
+				if (!hasher.hasNext())
+					if (prefixes.contains(rp, hasher.length(), hash))
+						return true;
 			}
+
+			final int cmp = walker.isPathPrefix(max, max.length);
+			if (cmp > 0)
+				throw StopWalkException.INSTANCE;
+
+			return false;
 		}
 
 		@Override
 		public boolean shouldBeRecursive() {
-			for (final PathFilter p : paths)
-				if (p.shouldBeRecursive())
-					return true;
-			return false;
+			return !prefixes.isEmpty();
 		}
 
 		@Override
@@ -222,13 +257,17 @@ public class PathFilterGroup {
 		public String toString() {
 			final StringBuilder r = new StringBuilder();
 			r.append("FAST("); //$NON-NLS-1$
-			for (int i = 0; i < paths.length; i++) {
-				if (i > 0)
+			boolean first = true;
+			for (byte[] p : fullpaths.toArray()) {
+				if (!first) {
 					r.append(" OR "); //$NON-NLS-1$
-				r.append(paths[i].toString());
+				}
+				r.append(RawParseUtils.decode(p));
+				first = false;
 			}
 			r.append(")"); //$NON-NLS-1$
 			return r.toString();
 		}
 	}
+
 }
author	Robin Rosenberg <robin.rosenberg@dewire.com>	2013-01-28 19:00:00 -0500
committer	Gerrit Code Review @ Eclipse.org <gerrit@eclipse.org>	2013-01-28 19:00:00 -0500
commit	ee413067fded57d817af5b6b9978b8ceb6e6374e (patch)
tree	76d00910ec5b361306fe224e77503f9e818a9424 /org.eclipse.jgit/src/org/eclipse/jgit/treewalk
parent	33bc4f7c052fc7a1150f0c9671736329d8100e29 (diff)
parent	522fc6a9c64bb28399f7ecadb239aa7a02c5a81b (diff)
download	jgit-ee413067fded57d817af5b6b9978b8ceb6e6374e.tar.gz jgit-ee413067fded57d817af5b6b9978b8ceb6e6374e.zip