From 6cf5d194f528428f94bda3e2a1a8468244008c81 Mon Sep 17 00:00:00 2001 From: XenoAmess Date: Tue, 24 Sep 2024 04:18:06 +0800 Subject: [PATCH] RawText: improve performance of isCrLfText and isBinary Inline the function isBinary(byte, byte), and reduce several duplicated checks in it, for better performance. Change-Id: Ida855ed4fd7456d8fb7ed68f3af2dbfa0e25897c --- .../jgit/benchmarks/RawTextBenchmark.java | 454 ++++++++++++++++++ .../src/org/eclipse/jgit/diff/RawText.java | 49 +- 2 files changed, 483 insertions(+), 20 deletions(-) create mode 100644 org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java diff --git a/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java b/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java new file mode 100644 index 0000000000..19297ebebb --- /dev/null +++ b/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java @@ -0,0 +1,454 @@ +/* + * Copyright (C) 2022, Matthias Sohn and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.benchmarks; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +import java.util.concurrent.TimeUnit; + +import static org.eclipse.jgit.diff.RawText.getBufferSize; +import static org.eclipse.jgit.diff.RawText.isBinary; +import static org.eclipse.jgit.diff.RawText.isCrLfText; + +@State(Scope.Thread) +public class RawTextBenchmark { + + @State(Scope.Benchmark) + public static class BenchmarkState { + + @Param({"1", "2", "3", "4", "5", "6"}) + int testIndex; + + @Param({"false", "true"}) + boolean complete; + + byte[] bytes; + + @Setup + public void setupBenchmark() { + switch (testIndex) { + case 1: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + bytes = tmpBytes; + break; + } + case 2: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; + System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); + tmpBytes2[500] = '\0'; + tmpBytes2[tmpBytes.length] = '\0'; + bytes = tmpBytes2; + break; + } + case 3: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; + System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); + tmpBytes2[500] = '\r'; + tmpBytes2[tmpBytes.length] = '\r'; + bytes = tmpBytes2; + break; + } + case 4: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + byte[] tmpBytes2 = new byte[tmpBytes.length + 1]; + System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length); + tmpBytes2[499] = '\r'; + tmpBytes2[500] = '\n'; + tmpBytes2[tmpBytes.length - 1] = '\r'; + tmpBytes2[tmpBytes.length] = '\n'; + bytes = tmpBytes2; + break; + } + case 5: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + tmpBytes[0] = '\0'; + bytes = tmpBytes; + break; + } + case 6: { + byte[] tmpBytes = "a".repeat(102400).getBytes(); + tmpBytes[0] = '\r'; + bytes = tmpBytes; + break; + } + default: + } + } + + @TearDown + public void teardown() { + } + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextOld(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfTextOld( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextNewCandidate1(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfTextNewCandidate1( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextNewCandidate2(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfTextNewCandidate2( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextNewCandidate3(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfTextNewCandidate3( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsCrLfTextNew(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isCrLfText( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsBinaryOld(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isBinaryOld( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + + @Benchmark + @BenchmarkMode({Mode.AverageTime}) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) + @Fork(1) + public void testIsBinaryNew(Blackhole blackhole, BenchmarkState state) { + blackhole.consume( + isBinary( + state.bytes, + state.bytes.length, + state.complete + ) + ); + } + + + /** + * Determine heuristically whether a byte array represents binary (as + * opposed to text) content. + * + * @param raw + * the raw file content. + * @param length + * number of bytes in {@code raw} to evaluate. This should be + * {@code raw.length} unless {@code raw} was over-allocated by + * the caller. + * @param complete + * whether {@code raw} contains the whole data + * @return true if raw is likely to be a binary file, false otherwise + * @since 6.0 + */ + public static boolean isBinaryOld(byte[] raw, int length, boolean complete) { + // Similar heuristic as C Git. Differences: + // - limited buffer size; may be only the beginning of a large blob + // - no counting of printable vs. non-printable bytes < 0x20 and 0x7F + int maxLength = getBufferSize(); + boolean isComplete = complete; + if (length > maxLength) { + // We restrict the length in all cases to getBufferSize() to get + // predictable behavior. Sometimes we load streams, and sometimes we + // have the full data in memory. With streams, we never look at more + // than the first getBufferSize() bytes. If we looked at more when + // we have the full data, different code paths in JGit might come to + // different conclusions. + length = maxLength; + isComplete = false; + } + byte last = 'x'; // Just something inconspicuous. + for (int ptr = 0; ptr < length; ptr++) { + byte curr = raw[ptr]; + if (isBinary(curr, last)) { + return true; + } + last = curr; + } + if (isComplete) { + // Buffer contains everything... + return last == '\r'; // ... so this must be a lone CR + } + return false; + } + + /** + * Determine heuristically whether a byte array represents text content + * using CR-LF as line separator. + * + * @param raw the raw file content. + * @param length number of bytes in {@code raw} to evaluate. + * @param complete whether {@code raw} contains the whole data + * @return {@code true} if raw is likely to be CR-LF delimited text, + * {@code false} otherwise + * @since 6.0 + */ + public static boolean isCrLfTextOld(byte[] raw, int length, boolean complete) { + boolean has_crlf = false; + byte last = 'x'; // Just something inconspicuous + for (int ptr = 0; ptr < length; ptr++) { + byte curr = raw[ptr]; + if (isBinary(curr, last)) { + return false; + } + if (curr == '\n' && last == '\r') { + has_crlf = true; + } + last = curr; + } + if (last == '\r') { + if (complete) { + // Lone CR: it's binary after all. + return false; + } + // Tough call. If the next byte, which we don't have, would be a + // '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide + // based on what we already scanned; it wasn't binary until now. + } + return has_crlf; + } + + /** + * Determine heuristically whether a byte array represents text content + * using CR-LF as line separator. + * + * @param raw + * the raw file content. + * @param length + * number of bytes in {@code raw} to evaluate. + * @return {@code true} if raw is likely to be CR-LF delimited text, + * {@code false} otherwise + * @param complete + * whether {@code raw} contains the whole data + * @since 6.0 + */ + public static boolean isCrLfTextNewCandidate1(byte[] raw, int length, boolean complete) { + boolean has_crlf = false; + + // first detect empty + if (length <= 0) { + return false; + } + + // next detect '\0' + for (int reversePtr = length - 1; reversePtr >= 0; --reversePtr) { + if (raw[reversePtr] == '\0') { + return false; + } + } + + // if '\r' be last, then if complete then return non-crlf + if (raw[length - 1] == '\r' && complete) { + return false; + } + + for (int ptr = 0; ptr < length - 1; ptr++) { + byte curr = raw[ptr]; + if (curr == '\r') { + byte next = raw[ptr + 1]; + if (next != '\n') { + return false; + } + // else + // we have crlf here + has_crlf = true; + // as next is '\n', it can never be '\r', just skip it from next check + ++ptr; + } + } + + return has_crlf; + } + + /** + * Determine heuristically whether a byte array represents text content + * using CR-LF as line separator. + * + * @param raw + * the raw file content. + * @param length + * number of bytes in {@code raw} to evaluate. + * @return {@code true} if raw is likely to be CR-LF delimited text, + * {@code false} otherwise + * @param complete + * whether {@code raw} contains the whole data + * @since 6.0 + */ + public static boolean isCrLfTextNewCandidate2(byte[] raw, int length, boolean complete) { + boolean has_crlf = false; + + // first detect empty + if (length <= 0) { + return false; + } + + // if '\r' be last, then if complete then return non-crlf + byte last = raw[length - 1]; + if (last == '\0' || last == '\r' && complete) { + return false; + } + + for (int ptr = 0; ptr < length - 1; ptr++) { + byte b = raw[ptr]; + switch (b) { + case '\0': + return false; + case '\r': { + ++ptr; + b = raw[ptr]; + if (b != '\n') { + return false; + } + // else + // we have crlf here + has_crlf = true; + // as next is '\n', it can never be '\r', just skip it from next check + break; + } + default: + // do nothing; + break; + } + } + + return has_crlf; + } + + /** + * Determine heuristically whether a byte array represents text content + * using CR-LF as line separator. + * + * @param raw + * the raw file content. + * @param length + * number of bytes in {@code raw} to evaluate. + * @return {@code true} if raw is likely to be CR-LF delimited text, + * {@code false} otherwise + * @param complete + * whether {@code raw} contains the whole data + * @since 6.0 + */ + public static boolean isCrLfTextNewCandidate3(byte[] raw, int length, boolean complete) { + boolean has_crlf = false; + + int ptr = -1; + byte current; + while (ptr < length - 2) { + current = raw[++ptr]; + if ('\0' == current || '\r' == current && (raw[++ptr] != '\n' || !(has_crlf = true))) { + return false; + } + } + + if (ptr == length - 2) { + // if '\r' be last, then if isComplete then return binary + current = raw[++ptr]; + if('\0' == current || '\r' == current && complete){ + return false; + } + } + + return has_crlf; + } + + + public static void main(String[] args) throws RunnerException { + Options opt = new OptionsBuilder() + .include(RawTextBenchmark.class.getSimpleName()) + .forks(1).jvmArgs("-ea").build(); + new Runner(opt).run(); + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java index 76dc87e72b..5a4d2aa45a 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java @@ -360,18 +360,22 @@ public class RawText extends Sequence { length = maxLength; isComplete = false; } - byte last = 'x'; // Just something inconspicuous. - for (int ptr = 0; ptr < length; ptr++) { - byte curr = raw[ptr]; - if (isBinary(curr, last)) { + + int ptr = -1; + byte current; + while (ptr < length - 2) { + current = raw[++ptr]; + if ('\0' == current || '\r' == current && raw[++ptr] != '\n') { return true; } - last = curr; } - if (isComplete) { - // Buffer contains everything... - return last == '\r'; // ... so this must be a lone CR + + if (ptr == length - 2) { + // if '\r' be last, then if isComplete then return binary + current = raw[++ptr]; + return '\0' == current || '\r' == current && isComplete; } + return false; } @@ -467,26 +471,30 @@ public class RawText extends Sequence { */ public static boolean isCrLfText(byte[] raw, int length, boolean complete) { boolean has_crlf = false; - byte last = 'x'; // Just something inconspicuous - for (int ptr = 0; ptr < length; ptr++) { - byte curr = raw[ptr]; - if (isBinary(curr, last)) { + + int ptr = -1; + byte current; + while (ptr < length - 2) { + current = raw[++ptr]; + if (current == '\0') { return false; } - if (curr == '\n' && last == '\r') { + if (current == '\r') { + if (raw[++ptr] != '\n') { + return false; + } has_crlf = true; } - last = curr; } - if (last == '\r') { - if (complete) { - // Lone CR: it's binary after all. + + if (ptr == length - 2) { + // if '\r' be last, then if isComplete then return binary + current = raw[++ptr]; + if('\0' == current || '\r' == current && complete){ return false; } - // Tough call. If the next byte, which we don't have, would be a - // '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide - // based on what we already scanned; it wasn't binary until now. } + return has_crlf; } @@ -578,4 +586,5 @@ public class RawText extends Sequence { return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz)); } } + } -- 2.39.5