]> source.dussan.org Git - jgit.git/commitdiff
RawText: improve performance of isCrLfText and isBinary 58/1200658/5
authorXenoAmess <xenoamess@gmail.com>
Mon, 23 Sep 2024 20:18:06 +0000 (04:18 +0800)
committerThomas Wolf <twolf@apache.org>
Wed, 13 Nov 2024 20:44:55 +0000 (21:44 +0100)
Inline the function isBinary(byte, byte), and reduce several duplicated
checks in it, for better performance.

Change-Id: Ida855ed4fd7456d8fb7ed68f3af2dbfa0e25897c

org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java [new file with mode: 0644]
org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java

diff --git a/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java b/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java
new file mode 100644 (file)
index 0000000..19297eb
--- /dev/null
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2022, Matthias Sohn <matthias.sohn@sap.com> and others
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Distribution License v. 1.0 which is available at
+ * https://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+package org.eclipse.jgit.benchmarks;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.util.concurrent.TimeUnit;
+
+import static org.eclipse.jgit.diff.RawText.getBufferSize;
+import static org.eclipse.jgit.diff.RawText.isBinary;
+import static org.eclipse.jgit.diff.RawText.isCrLfText;
+
+@State(Scope.Thread)
+public class RawTextBenchmark {
+
+       @State(Scope.Benchmark)
+       public static class BenchmarkState {
+
+               @Param({"1", "2", "3", "4", "5", "6"})
+               int testIndex;
+
+               @Param({"false", "true"})
+               boolean complete;
+
+               byte[] bytes;
+
+               @Setup
+               public void setupBenchmark() {
+                       switch (testIndex) {
+                               case 1: {
+                                       byte[] tmpBytes = "a".repeat(102400).getBytes();
+                                       bytes = tmpBytes;
+                                       break;
+                               }
+                               case 2: {
+                                       byte[] tmpBytes = "a".repeat(102400).getBytes();
+                                       byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
+                                       System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
+                                       tmpBytes2[500] = '\0';
+                                       tmpBytes2[tmpBytes.length] = '\0';
+                                       bytes = tmpBytes2;
+                                       break;
+                               }
+                               case 3: {
+                                       byte[] tmpBytes = "a".repeat(102400).getBytes();
+                                       byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
+                                       System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
+                                       tmpBytes2[500] = '\r';
+                                       tmpBytes2[tmpBytes.length] = '\r';
+                                       bytes = tmpBytes2;
+                                       break;
+                               }
+                               case 4: {
+                                       byte[] tmpBytes = "a".repeat(102400).getBytes();
+                                       byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
+                                       System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
+                                       tmpBytes2[499] = '\r';
+                                       tmpBytes2[500] = '\n';
+                                       tmpBytes2[tmpBytes.length - 1] = '\r';
+                                       tmpBytes2[tmpBytes.length] = '\n';
+                                       bytes = tmpBytes2;
+                                       break;
+                               }
+                               case 5: {
+                                       byte[] tmpBytes = "a".repeat(102400).getBytes();
+                                       tmpBytes[0] = '\0';
+                                       bytes = tmpBytes;
+                                       break;
+                               }
+                               case 6: {
+                                       byte[] tmpBytes = "a".repeat(102400).getBytes();
+                                       tmpBytes[0] = '\r';
+                                       bytes = tmpBytes;
+                                       break;
+                               }
+                               default:
+                       }
+               }
+
+               @TearDown
+               public void teardown() {
+               }
+       }
+
+       @Benchmark
+       @BenchmarkMode({Mode.AverageTime})
+       @OutputTimeUnit(TimeUnit.NANOSECONDS)
+       @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Fork(1)
+       public void testIsCrLfTextOld(Blackhole blackhole, BenchmarkState state) {
+               blackhole.consume(
+                               isCrLfTextOld(
+                                               state.bytes,
+                                               state.bytes.length,
+                                               state.complete
+                               )
+               );
+       }
+
+       @Benchmark
+       @BenchmarkMode({Mode.AverageTime})
+       @OutputTimeUnit(TimeUnit.NANOSECONDS)
+       @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Fork(1)
+       public void testIsCrLfTextNewCandidate1(Blackhole blackhole, BenchmarkState state) {
+               blackhole.consume(
+                               isCrLfTextNewCandidate1(
+                                               state.bytes,
+                                               state.bytes.length,
+                                               state.complete
+                               )
+               );
+       }
+
+       @Benchmark
+       @BenchmarkMode({Mode.AverageTime})
+       @OutputTimeUnit(TimeUnit.NANOSECONDS)
+       @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Fork(1)
+       public void testIsCrLfTextNewCandidate2(Blackhole blackhole, BenchmarkState state) {
+               blackhole.consume(
+                               isCrLfTextNewCandidate2(
+                                               state.bytes,
+                                               state.bytes.length,
+                                               state.complete
+                               )
+               );
+       }
+
+       @Benchmark
+       @BenchmarkMode({Mode.AverageTime})
+       @OutputTimeUnit(TimeUnit.NANOSECONDS)
+       @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Fork(1)
+       public void testIsCrLfTextNewCandidate3(Blackhole blackhole, BenchmarkState state) {
+               blackhole.consume(
+                               isCrLfTextNewCandidate3(
+                                               state.bytes,
+                                               state.bytes.length,
+                                               state.complete
+                               )
+               );
+       }
+
+       @Benchmark
+       @BenchmarkMode({Mode.AverageTime})
+       @OutputTimeUnit(TimeUnit.NANOSECONDS)
+       @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Fork(1)
+       public void testIsCrLfTextNew(Blackhole blackhole, BenchmarkState state) {
+               blackhole.consume(
+                               isCrLfText(
+                                               state.bytes,
+                                               state.bytes.length,
+                                               state.complete
+                               )
+               );
+       }
+
+       @Benchmark
+       @BenchmarkMode({Mode.AverageTime})
+       @OutputTimeUnit(TimeUnit.NANOSECONDS)
+       @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Fork(1)
+       public void testIsBinaryOld(Blackhole blackhole, BenchmarkState state) {
+               blackhole.consume(
+                               isBinaryOld(
+                                               state.bytes,
+                                               state.bytes.length,
+                                               state.complete
+                               )
+               );
+       }
+
+
+       @Benchmark
+       @BenchmarkMode({Mode.AverageTime})
+       @OutputTimeUnit(TimeUnit.NANOSECONDS)
+       @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+       @Fork(1)
+       public void testIsBinaryNew(Blackhole blackhole, BenchmarkState state) {
+               blackhole.consume(
+                               isBinary(
+                                               state.bytes,
+                                               state.bytes.length,
+                                               state.complete
+                               )
+               );
+       }
+
+
+       /**
+        * Determine heuristically whether a byte array represents binary (as
+        * opposed to text) content.
+        *
+        * @param raw
+        *                      the raw file content.
+        * @param length
+        *                      number of bytes in {@code raw} to evaluate. This should be
+        *                      {@code raw.length} unless {@code raw} was over-allocated by
+        *                      the caller.
+        * @param complete
+        *                      whether {@code raw} contains the whole data
+        * @return true if raw is likely to be a binary file, false otherwise
+        * @since 6.0
+        */
+       public static boolean isBinaryOld(byte[] raw, int length, boolean complete) {
+               // Similar heuristic as C Git. Differences:
+               // - limited buffer size; may be only the beginning of a large blob
+               // - no counting of printable vs. non-printable bytes < 0x20 and 0x7F
+               int maxLength = getBufferSize();
+               boolean isComplete = complete;
+               if (length > maxLength) {
+                       // We restrict the length in all cases to getBufferSize() to get
+                       // predictable behavior. Sometimes we load streams, and sometimes we
+                       // have the full data in memory. With streams, we never look at more
+                       // than the first getBufferSize() bytes. If we looked at more when
+                       // we have the full data, different code paths in JGit might come to
+                       // different conclusions.
+                       length = maxLength;
+                       isComplete = false;
+               }
+               byte last = 'x'; // Just something inconspicuous.
+               for (int ptr = 0; ptr < length; ptr++) {
+                       byte curr = raw[ptr];
+                       if (isBinary(curr, last)) {
+                               return true;
+                       }
+                       last = curr;
+               }
+               if (isComplete) {
+                       // Buffer contains everything...
+                       return last == '\r'; // ... so this must be a lone CR
+               }
+               return false;
+       }
+
+       /**
+        * Determine heuristically whether a byte array represents text content
+        * using CR-LF as line separator.
+        *
+        * @param raw     the raw file content.
+        * @param length   number of bytes in {@code raw} to evaluate.
+        * @param complete whether {@code raw} contains the whole data
+        * @return {@code true} if raw is likely to be CR-LF delimited text,
+        * {@code false} otherwise
+        * @since 6.0
+        */
+       public static boolean isCrLfTextOld(byte[] raw, int length, boolean complete) {
+               boolean has_crlf = false;
+               byte last = 'x'; // Just something inconspicuous
+               for (int ptr = 0; ptr < length; ptr++) {
+                       byte curr = raw[ptr];
+                       if (isBinary(curr, last)) {
+                               return false;
+                       }
+                       if (curr == '\n' && last == '\r') {
+                               has_crlf = true;
+                       }
+                       last = curr;
+               }
+               if (last == '\r') {
+                       if (complete) {
+                               // Lone CR: it's binary after all.
+                               return false;
+                       }
+                       // Tough call. If the next byte, which we don't have, would be a
+                       // '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide
+                       // based on what we already scanned; it wasn't binary until now.
+               }
+               return has_crlf;
+       }
+
+       /**
+        * Determine heuristically whether a byte array represents text content
+        * using CR-LF as line separator.
+        *
+        * @param raw
+        *                      the raw file content.
+        * @param length
+        *                      number of bytes in {@code raw} to evaluate.
+        * @return {@code true} if raw is likely to be CR-LF delimited text,
+        *               {@code false} otherwise
+        * @param complete
+        *                      whether {@code raw} contains the whole data
+        * @since 6.0
+        */
+       public static boolean isCrLfTextNewCandidate1(byte[] raw, int length, boolean complete) {
+               boolean has_crlf = false;
+
+               // first detect empty
+               if (length <= 0) {
+                       return false;
+               }
+
+               // next detect '\0'
+               for (int reversePtr = length - 1; reversePtr >= 0; --reversePtr) {
+                       if (raw[reversePtr] == '\0') {
+                               return false;
+                       }
+               }
+
+               // if '\r' be last, then if complete then return non-crlf
+               if (raw[length - 1] == '\r' && complete) {
+                       return false;
+               }
+
+               for (int ptr = 0; ptr < length - 1; ptr++) {
+                       byte curr = raw[ptr];
+                       if (curr == '\r') {
+                               byte next = raw[ptr + 1];
+                               if (next != '\n') {
+                                       return false;
+                               }
+                               // else
+                               // we have crlf here
+                               has_crlf = true;
+                               // as next is '\n', it can never be '\r', just skip it from next check
+                               ++ptr;
+                       }
+               }
+
+               return has_crlf;
+       }
+
+       /**
+        * Determine heuristically whether a byte array represents text content
+        * using CR-LF as line separator.
+        *
+        * @param raw
+        *                      the raw file content.
+        * @param length
+        *                      number of bytes in {@code raw} to evaluate.
+        * @return {@code true} if raw is likely to be CR-LF delimited text,
+        *               {@code false} otherwise
+        * @param complete
+        *                      whether {@code raw} contains the whole data
+        * @since 6.0
+        */
+       public static boolean isCrLfTextNewCandidate2(byte[] raw, int length, boolean complete) {
+               boolean has_crlf = false;
+
+               // first detect empty
+               if (length <= 0) {
+                       return false;
+               }
+
+               // if '\r' be last, then if complete then return non-crlf
+               byte last = raw[length - 1];
+               if (last == '\0' || last == '\r' && complete) {
+                       return false;
+               }
+
+               for (int ptr = 0; ptr < length - 1; ptr++) {
+                       byte b = raw[ptr];
+                       switch (b) {
+                               case  '\0':
+                                       return false;
+                               case '\r': {
+                                       ++ptr;
+                                       b = raw[ptr];
+                                       if (b != '\n') {
+                                               return false;
+                                       }
+                                       // else
+                                       // we have crlf here
+                                       has_crlf = true;
+                                       // as next is '\n', it can never be '\r', just skip it from next check
+                                       break;
+                               }
+                               default:
+                                       // do nothing;
+                                       break;
+                       }
+               }
+
+               return has_crlf;
+       }
+
+       /**
+        * Determine heuristically whether a byte array represents text content
+        * using CR-LF as line separator.
+        *
+        * @param raw
+        *                      the raw file content.
+        * @param length
+        *                      number of bytes in {@code raw} to evaluate.
+        * @return {@code true} if raw is likely to be CR-LF delimited text,
+        *               {@code false} otherwise
+        * @param complete
+        *                      whether {@code raw} contains the whole data
+        * @since 6.0
+        */
+       public static boolean isCrLfTextNewCandidate3(byte[] raw, int length, boolean complete) {
+               boolean has_crlf = false;
+
+               int ptr = -1;
+               byte current;
+               while (ptr < length - 2) {
+                       current = raw[++ptr];
+                       if ('\0' == current || '\r' == current && (raw[++ptr] != '\n' || !(has_crlf = true))) {
+                               return false;
+                       }
+               }
+
+               if (ptr == length - 2) {
+                       // if '\r' be last, then if isComplete then return binary
+                       current = raw[++ptr];
+                       if('\0' == current || '\r' == current && complete){
+                               return false;
+                       }
+               }
+
+               return has_crlf;
+       }
+
+
+       public static void main(String[] args) throws RunnerException {
+               Options opt = new OptionsBuilder()
+                               .include(RawTextBenchmark.class.getSimpleName())
+                               .forks(1).jvmArgs("-ea").build();
+               new Runner(opt).run();
+       }
+}
index 76dc87e72b85a153b84f30ac7b92079e403f5916..5a4d2aa45a7510c04ec35d28a36197c12f1ca7c0 100644 (file)
@@ -360,18 +360,22 @@ public class RawText extends Sequence {
                        length = maxLength;
                        isComplete = false;
                }
-               byte last = 'x'; // Just something inconspicuous.
-               for (int ptr = 0; ptr < length; ptr++) {
-                       byte curr = raw[ptr];
-                       if (isBinary(curr, last)) {
+
+               int ptr = -1;
+               byte current;
+               while (ptr < length - 2) {
+                       current = raw[++ptr];
+                       if ('\0' == current || '\r' == current && raw[++ptr] != '\n') {
                                return true;
                        }
-                       last = curr;
                }
-               if (isComplete) {
-                       // Buffer contains everything...
-                       return last == '\r'; // ... so this must be a lone CR
+
+               if (ptr == length - 2) {
+                       // if '\r' be last, then if isComplete then return binary
+                       current = raw[++ptr];
+                       return '\0' == current || '\r' == current && isComplete;
                }
+
                return false;
        }
 
@@ -467,26 +471,30 @@ public class RawText extends Sequence {
         */
        public static boolean isCrLfText(byte[] raw, int length, boolean complete) {
                boolean has_crlf = false;
-               byte last = 'x'; // Just something inconspicuous
-               for (int ptr = 0; ptr < length; ptr++) {
-                       byte curr = raw[ptr];
-                       if (isBinary(curr, last)) {
+
+               int ptr = -1;
+               byte current;
+               while (ptr < length - 2) {
+                       current = raw[++ptr];
+                       if (current == '\0') {
                                return false;
                        }
-                       if (curr == '\n' && last == '\r') {
+                       if (current == '\r') {
+                               if (raw[++ptr] != '\n') {
+                                       return false;
+                               }
                                has_crlf = true;
                        }
-                       last = curr;
                }
-               if (last == '\r') {
-                       if (complete) {
-                               // Lone CR: it's binary after all.
+
+               if (ptr == length - 2) {
+                       // if '\r' be last, then if isComplete then return binary
+                       current = raw[++ptr];
+                       if('\0' == current || '\r' == current && complete){
                                return false;
                        }
-                       // Tough call. If the next byte, which we don't have, would be a
-                       // '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide
-                       // based on what we already scanned; it wasn't binary until now.
                }
+
                return has_crlf;
        }
 
@@ -578,4 +586,5 @@ public class RawText extends Sequence {
                        return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
                }
        }
+
 }