From 31e3cb4375f92e56f27b83c4583523c14a712b2d Mon Sep 17 00:00:00 2001
From: Gal Paikin <paiking@google.com>
Date: Mon, 7 Dec 2020 15:18:34 +0100
Subject: Compare getting all refs except specific refs with seek and with
 filter

There are currently two ways to get all refs except a specific ref, we
add two methods that perform both and compare the two different approaches.

This change adds two methods that compares the two different approaches
of such query:
1. Get all the refs, and then filter by refs that don't start with the
prefix (current approach).
2. Get all refs until encountering a ref that is part of the prefix we
should exclude, skip using seekPastPrefix, and continue (new approach).
This works since the refs are sorted.

Specifically in Gerrit, we often have thousands of refs that are not
refs/changes, and millions of refs/changes, hence the second approach
should be much faster. In Jgit in general it's still expected to provide
a better result even if we're skipping a smaller chunk of the refs
since the complexity here is O(logn) with a binary search, rather than
O(number of skipped refs).

We ran this benchmark on a big chunk of chromium/src's reftable. To run
it, we first create the reftable:

  git ls-remote https://chromium.googlesource.com/chromium/src > lsr

  bazel build org.eclipse.jgit.pgm:jgit && rm -rf /tmp/reftable* && \
    ./bazel-bin/org.eclipse.jgit.pgm/jgit debug-benchmark-reftable \
    --test write_stack lsr /tmp/reftable

Then, we actually test the created reftable. Note that we can't test all
of them at once since there are multiple ones, but below is a good
example.

bazel build org.eclipse.jgit.pgm:jgit  && \
./bazel-bin/org.eclipse.jgit.pgm/jgit debug-benchmark-reftable \
--test get_refs_excluding_ref --ref refs/changes \
lsr /tmp/reftable/000000000001-0000001e0371.ref

Result:
total time the action took using seek:      36925 usec
total time the action took using filter:     874382 usec
number of refs that start with prefix: 4266.
number of refs that don't start with prefix: 1962695.

Similarly for Android's biggest repository, platform/frameworks/base
(still only partial result):
total time the action took using seek:       9020 usec
total time the action took using filter:     143166 usec
number of refs that start with prefix: 296.
number of refs that don't start with prefix: 60400.

In conclusion, it's easy to see an improvement of a factor of 15-20x for
large Gerrit repositories!

Signed-off-by: Gal Paikin <paiking@google.com>
Change-Id: I36d9b63eb259804c774864429cf2c761cd099cc3
---
 .../eclipse/jgit/pgm/debug/BenchmarkReftable.java  | 54 +++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/org.eclipse.jgit.pgm/src/org/eclipse/jgit/pgm/debug/BenchmarkReftable.java b/org.eclipse.jgit.pgm/src/org/eclipse/jgit/pgm/debug/BenchmarkReftable.java
index 630fac549e..f23f4cf0ea 100644
--- a/org.eclipse.jgit.pgm/src/org/eclipse/jgit/pgm/debug/BenchmarkReftable.java
+++ b/org.eclipse.jgit.pgm/src/org/eclipse/jgit/pgm/debug/BenchmarkReftable.java
@@ -23,7 +23,9 @@ import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import org.eclipse.jgit.internal.storage.file.FileReftableStack;
 import org.eclipse.jgit.internal.storage.io.BlockSource;
@@ -47,6 +49,7 @@ class BenchmarkReftable extends TextBuiltin {
 		SEEK_COLD, SEEK_HOT,
 		BY_ID_COLD, BY_ID_HOT,
 		WRITE_STACK,
+		GET_REFS_EXCLUDING_REF
 	}
 
 	@Option(name = "--tries")
@@ -91,7 +94,11 @@ class BenchmarkReftable extends TextBuiltin {
 		case WRITE_STACK:
 			writeStack();
 			break;
-		}
+		case GET_REFS_EXCLUDING_REF :
+			getRefsExcludingWithSeekPast(ref);
+			getRefsExcludingWithFilter(ref);
+			break;
+	}
 	}
 
 	private void printf(String fmt, Object... args) throws IOException {
@@ -315,4 +322,49 @@ class BenchmarkReftable extends TextBuiltin {
 		printf("%12s %10d usec  %9.1f usec/run  %5d runs", "reftable",
 				tot / 1000, (((double) tot) / tries) / 1000, tries);
 	}
+
+	@SuppressWarnings({"nls", "boxing"})
+	private void getRefsExcludingWithFilter(String prefix) throws Exception {
+		long startTime = System.nanoTime();
+		List<Ref> allRefs = new ArrayList<>();
+		try (FileInputStream in = new FileInputStream(reftablePath);
+				BlockSource src = BlockSource.from(in);
+				ReftableReader reader = new ReftableReader(src)) {
+			try (RefCursor rc = reader.allRefs()) {
+				while (rc.next()) {
+					allRefs.add(rc.getRef());
+				}
+			}
+		}
+		int total = allRefs.size();
+		allRefs = allRefs.stream().filter(r -> r.getName().startsWith(prefix)).collect(Collectors.toList());
+		int notStartWithPrefix = allRefs.size();
+		int startWithPrefix = total - notStartWithPrefix;
+		long totalTime = System.nanoTime() - startTime;
+		printf("total time the action took using filter: %10d usec", totalTime / 1000);
+		printf("number of refs that start with prefix: %d", startWithPrefix);
+		printf("number of refs that don't start with prefix: %d", notStartWithPrefix);
+	}
+
+	@SuppressWarnings({"nls", "boxing"})
+	private void getRefsExcludingWithSeekPast(String prefix) throws Exception {
+		long start = System.nanoTime();
+		try (FileInputStream in = new FileInputStream(reftablePath);
+				BlockSource src = BlockSource.from(in);
+				ReftableReader reader = new ReftableReader(src)) {
+			try (RefCursor rc = reader.allRefs()) {
+				while (rc.next()) {
+					if (rc.getRef().getName().startsWith(prefix)) {
+						break;
+					}
+				}
+				rc.seekPastPrefix(prefix);
+				while (rc.next()) {
+					rc.getRef();
+				}
+			}
+		}
+		long tot = System.nanoTime() - start;
+		printf("total time the action took using seek: %10d usec", tot / 1000);
+	}
 }
-- 
cgit v1.2.3