diff options
author | XenoAmess <xenoamess@gmail.com> | 2024-09-24 04:18:06 +0800 |
---|---|---|
committer | Thomas Wolf <twolf@apache.org> | 2024-11-13 21:44:55 +0100 |
commit | 6cf5d194f528428f94bda3e2a1a8468244008c81 (patch) | |
tree | be9917c89c1a6face1dfd81ac54fb20a05f2fd2d /org.eclipse.jgit.benchmarks/src | |
parent | bf48fb73a211b020551d574a1ab50138697b7160 (diff) | |
download | jgit-6cf5d194f528428f94bda3e2a1a8468244008c81.tar.gz jgit-6cf5d194f528428f94bda3e2a1a8468244008c81.zip |
RawText: improve performance of isCrLfText and isBinary
Inline the function isBinary(byte, byte), and reduce several duplicated
checks in it, for better performance.
Change-Id: Ida855ed4fd7456d8fb7ed68f3af2dbfa0e25897c
Diffstat (limited to 'org.eclipse.jgit.benchmarks/src')
-rw-r--r-- | org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java | 454 |
1 files changed, 454 insertions, 0 deletions
diff --git a/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java b/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java new file mode 100644 index 0000000000..19297ebebb --- /dev/null +++ b/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java @@ -0,0 +1,454 @@ +/* + * Copyright (C) 2022, Matthias Sohn <matthias.sohn@sap.com> and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.benchmarks; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +import java.util.concurrent.TimeUnit; + +import static org.eclipse.jgit.diff.RawText.getBufferSize; +import static org.eclipse.jgit.diff.RawText.isBinary; +import static org.eclipse.jgit.diff.RawText.isCrLfText; + +@State(Scope.Thread) +public class RawTextBenchmark { + + @State(Scope.Benchmark) + public static class BenchmarkState { + + @Param({"1", "2", "3", "4", "5", "6"}) + int testIndex; + + @Param({"false", "true"}) + boolean complete; + + byte[] bytes; + + @Setup + public void setupBenchmark() { + switch (testIndex) { + case 1: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + bytes = tmpBytes; + break; + } + case 2: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; + System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); + tmpBytes2[500] = '\0'; + tmpBytes2[tmpBytes.length] = '\0'; + bytes = tmpBytes2; + break; + } + case 3: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; + System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); + tmpBytes2[500] = '\r'; + tmpBytes2[tmpBytes.length] = '\r'; + bytes = tmpBytes2; + break; + } + case 4: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; + System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); + tmpBytes2[499] = '\r'; + tmpBytes2[500] = '\n'; + tmpBytes2[tmpBytes.length - 1] = '\r'; + tmpBytes2[tmpBytes.length] = '\n'; + bytes = tmpBytes2; + break; + } + case 5: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + tmpBytes[0] = '\0'; + bytes = tmpBytes; + break; + } + case 6: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + tmpBytes[0] = '\r'; + bytes = tmpBytes; + break; + } + default: + } + } + + @TearDown + public void teardown() { + } + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextOld(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfTextOld( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextNewCandidate1(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfTextNewCandidate1( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextNewCandidate2(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfTextNewCandidate2( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextNewCandidate3(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfTextNewCandidate3( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextNew(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfText( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsBinaryOld(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isBinaryOld( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsBinaryNew(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isBinary( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + + /** + * Determine heuristically whether a byte array represents binary (as + * opposed to text) content. + * + * @param raw + * the raw file content. + * @param length + * number of bytes in {@code raw} to evaluate. This should be + * {@code raw.length} unless {@code raw} was over-allocated by + * the caller. + * @param complete + * whether {@code raw} contains the whole data + * @return true if raw is likely to be a binary file, false otherwise + * @since 6.0 + */ + public static boolean isBinaryOld(byte[] raw, int length, boolean complete) { + // Similar heuristic as C Git. Differences: + // - limited buffer size; may be only the beginning of a large blob + // - no counting of printable vs. non-printable bytes < 0x20 and 0x7F + int maxLength = getBufferSize(); + boolean isComplete = complete; + if (length > maxLength) { + // We restrict the length in all cases to getBufferSize() to get + // predictable behavior. Sometimes we load streams, and sometimes we + // have the full data in memory. With streams, we never look at more + // than the first getBufferSize() bytes. If we looked at more when + // we have the full data, different code paths in JGit might come to + // different conclusions. + length = maxLength; + isComplete = false; + } + byte last = 'x'; // Just something inconspicuous. + for (int ptr = 0; ptr < length; ptr++) { + byte curr = raw[ptr]; + if (isBinary(curr, last)) { + return true; + } + last = curr; + } + if (isComplete) { + // Buffer contains everything... + return last == '\r'; // ... so this must be a lone CR + } + return false; + } + + /** + * Determine heuristically whether a byte array represents text content + * using CR-LF as line separator. + * + * @param raw the raw file content. + * @param length number of bytes in {@code raw} to evaluate. + * @param complete whether {@code raw} contains the whole data + * @return {@code true} if raw is likely to be CR-LF delimited text, + * {@code false} otherwise + * @since 6.0 + */ + public static boolean isCrLfTextOld(byte[] raw, int length, boolean complete) { + boolean has_crlf = false; + byte last = 'x'; // Just something inconspicuous + for (int ptr = 0; ptr < length; ptr++) { + byte curr = raw[ptr]; + if (isBinary(curr, last)) { + return false; + } + if (curr == '\n' && last == '\r') { + has_crlf = true; + } + last = curr; + } + if (last == '\r') { + if (complete) { + // Lone CR: it's binary after all. + return false; + } + // Tough call. If the next byte, which we don't have, would be a + // '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide + // based on what we already scanned; it wasn't binary until now. + } + return has_crlf; + } + + /** + * Determine heuristically whether a byte array represents text content + * using CR-LF as line separator. + * + * @param raw + * the raw file content. + * @param length + * number of bytes in {@code raw} to evaluate. + * @return {@code true} if raw is likely to be CR-LF delimited text, + * {@code false} otherwise + * @param complete + * whether {@code raw} contains the whole data + * @since 6.0 + */ + public static boolean isCrLfTextNewCandidate1(byte[] raw, int length, boolean complete) { + boolean has_crlf = false; + + // first detect empty + if (length <= 0) { + return false; + } + + // next detect '\0' + for (int reversePtr = length - 1; reversePtr >= 0; --reversePtr) { + if (raw[reversePtr] == '\0') { + return false; + } + } + + // if '\r' be last, then if complete then return non-crlf + if (raw[length - 1] == '\r' && complete) { + return false; + } + + for (int ptr = 0; ptr < length - 1; ptr++) { + byte curr = raw[ptr]; + if (curr == '\r') { + byte next = raw[ptr + 1]; + if (next != '\n') { + return false; + } + // else + // we have crlf here + has_crlf = true; + // as next is '\n', it can never be '\r', just skip it from next check + ++ptr; + } + } + + return has_crlf; + } + + /** + * Determine heuristically whether a byte array represents text content + * using CR-LF as line separator. + * + * @param raw + * the raw file content. + * @param length + * number of bytes in {@code raw} to evaluate. + * @return {@code true} if raw is likely to be CR-LF delimited text, + * {@code false} otherwise + * @param complete + * whether {@code raw} contains the whole data + * @since 6.0 + */ + public static boolean isCrLfTextNewCandidate2(byte[] raw, int length, boolean complete) { + boolean has_crlf = false; + + // first detect empty + if (length <= 0) { + return false; + } + + // if '\r' be last, then if complete then return non-crlf + byte last = raw[length - 1]; + if (last == '\0' || last == '\r' && complete) { + return false; + } + + for (int ptr = 0; ptr < length - 1; ptr++) { + byte b = raw[ptr]; + switch (b) { + case '\0': + return false; + case '\r': { + ++ptr; + b = raw[ptr]; + if (b != '\n') { + return false; + } + // else + // we have crlf here + has_crlf = true; + // as next is '\n', it can never be '\r', just skip it from next check + break; + } + default: + // do nothing; + break; + } + } + + return has_crlf; + } + + /** + * Determine heuristically whether a byte array represents text content + * using CR-LF as line separator. + * + * @param raw + * the raw file content. + * @param length + * number of bytes in {@code raw} to evaluate. + * @return {@code true} if raw is likely to be CR-LF delimited text, + * {@code false} otherwise + * @param complete + * whether {@code raw} contains the whole data + * @since 6.0 + */ + public static boolean isCrLfTextNewCandidate3(byte[] raw, int length, boolean complete) { + boolean has_crlf = false; + + int ptr = -1; + byte current; + while (ptr < length - 2) { + current = raw[++ptr]; + if ('\0' == current || '\r' == current && (raw[++ptr] != '\n' || !(has_crlf = true))) { + return false; + } + } + + if (ptr == length - 2) { + // if '\r' be last, then if isComplete then return binary + current = raw[++ptr]; + if('\0' == current || '\r' == current && complete){ + return false; + } + } + + return has_crlf; + } + + + public static void main(String[] args) throws RunnerException { + Options opt = new OptionsBuilder() + .include(RawTextBenchmark.class.getSimpleName()) + .forks(1).jvmArgs("-ea").build(); + new Runner(opt).run(); + } +} |