aboutsummaryrefslogtreecommitdiffstats
path: root/org.eclipse.jgit/src/org
diff options
context:
space:
mode:
author: Thomas Wolf <thomas.wolf@paranor.ch> 2021-10-13 00:28:38 +0200
committer: Thomas Wolf <thomas.wolf@paranor.ch> 2021-10-30 23:05:22 +0200
commit: c2204bb6835e4e6dc666bb34eaea910fb1484092 (patch)
tree: ed31d5d1009a7968fc3e75d72de71fee392f8715 /org.eclipse.jgit/src/org
parent: 331a489c7f116f98da6e6b6614af73a0a255b8e8 (diff)
download: jgit-c2204bb6835e4e6dc666bb34eaea910fb1484092.tar.gz
download: jgit-c2204bb6835e4e6dc666bb34eaea910fb1484092.zip
Make the buffer size for text/binary detection configurable
The various streams used in JGit for text/binary and CR-LF detection used different buffer sizes. Most used 8000, but one used 8KiB, and one used 8096 (SIC!) bytes.

Considering only the first 8kB of a file/blob is not sufficient; it may give behavior incompatible with C git. C git considers the whole blob; since it uses memory-mapped files it can do so with acceptable performance. Doing this in JGit would most likely incur a noticeable performance penalty. But 8kB is a bit small; in the file in bug 576971 the limit was hit before the first CR-LF, which occurred on line 155 at offset 9759 in the file.

Make RawText.FIRST_FEW_BYTES only a default and minimum setting, and set it to 8KiB. Make the actual buffer size configurable: provide static methods getBufferSize() and setBufferSize(), and use getBufferSize() throughout instead of the constant.

This enables users of the JGit library to set their own possibly larger buffer size.

Bug: 576971
Change-Id: I447762c9a5147a521f73d2864ba59ed89f555d54
Signed-off-by: Thomas Wolf <thomas.wolf@paranor.ch>
Diffstat (limited to 'org.eclipse.jgit/src/org')
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java | 67
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java | 9
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java | 9
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java | 11
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java | 12
5 files changed, 73 insertions, 35 deletions
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
index d09da019dd..914fa5f6f7 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
@@ -17,6 +17,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
+import java.util.concurrent.atomic.AtomicInteger;
import org.eclipse.jgit.errors.BinaryBlobException;
import org.eclipse.jgit.errors.LargeObjectException;
@@ -38,11 +39,20 @@ import org.eclipse.jgit.util.RawParseUtils;
* they are converting from "line number" to "element index".
*/
public class RawText extends Sequence {
+
/** A RawText of length 0 */
public static final RawText EMPTY_TEXT = new RawText(new byte[0]);
- /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
- static final int FIRST_FEW_BYTES = 8000;
+ /**
+ * Default and minimum for {@link #BUFFER_SIZE}.
+ */
+ private static final int FIRST_FEW_BYTES = 8 * 1024;
+
+ /**
+ * Number of bytes to check for heuristics in {@link #isBinary(byte[])}.
+ */
+ private static final AtomicInteger BUFFER_SIZE = new AtomicInteger(
+ FIRST_FEW_BYTES);
/** The file content for this sequence. */
protected final byte[] content;
@@ -248,6 +258,33 @@ public class RawText extends Sequence {
}
/**
+ * Obtains the buffer size to use for analyzing whether certain content is
+ * text or binary, or what line endings are used if it's text.
+ *
+ * @return the buffer size, by default {@link #FIRST_FEW_BYTES} bytes
+ * @since 6.0
+ */
+ public static int getBufferSize() {
+ return BUFFER_SIZE.get();
+ }
+
+ /**
+ * Sets the buffer size to use for analyzing whether certain content is text
+ * or binary, or what line endings are used if it's text. If the given
+ * {@code bufferSize} is smaller than {@link #FIRST_FEW_BYTES} set the
+ * buffer size to {@link #FIRST_FEW_BYTES}.
+ *
+ * @param bufferSize
+ * Size to set
+ * @return the size actually set
+ * @since 6.0
+ */
+ public static int setBufferSize(int bufferSize) {
+ int newSize = Math.max(FIRST_FEW_BYTES, bufferSize);
+ return BUFFER_SIZE.updateAndGet(curr -> newSize);
+ }
+
+ /**
* Determine heuristically whether the bytes contained in a stream
* represents binary (as opposed to text) content.
*
@@ -263,7 +300,7 @@ public class RawText extends Sequence {
* if input stream could not be read
*/
public static boolean isBinary(InputStream raw) throws IOException {
- final byte[] buffer = new byte[FIRST_FEW_BYTES];
+ final byte[] buffer = new byte[getBufferSize()];
int cnt = 0;
while (cnt < buffer.length) {
final int n = raw.read(buffer, cnt, buffer.length - cnt);
@@ -287,13 +324,16 @@ public class RawText extends Sequence {
* @return true if raw is likely to be a binary file, false otherwise
*/
public static boolean isBinary(byte[] raw, int length) {
- // Same heuristic as C Git
- if (length > FIRST_FEW_BYTES)
- length = FIRST_FEW_BYTES;
- for (int ptr = 0; ptr < length; ptr++)
- if (raw[ptr] == '\0')
+ // Same heuristic as C Git (except for the buffer size)
+ int maxLength = getBufferSize();
+ if (length > maxLength) {
+ length = maxLength;
+ }
+ for (int ptr = 0; ptr < length; ptr++) {
+ if (raw[ptr] == '\0') {
return true;
-
+ }
+ }
return false;
}
@@ -329,7 +369,7 @@ public class RawText extends Sequence {
* @since 5.3
*/
public static boolean isCrLfText(InputStream raw) throws IOException {
- byte[] buffer = new byte[FIRST_FEW_BYTES];
+ byte[] buffer = new byte[getBufferSize()];
int cnt = 0;
while (cnt < buffer.length) {
int n = raw.read(buffer, cnt, buffer.length - cnt);
@@ -409,15 +449,16 @@ public class RawText extends Sequence {
throw new BinaryBlobException();
}
- if (sz <= FIRST_FEW_BYTES) {
- byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES);
+ int bufferSize = getBufferSize();
+ if (sz <= bufferSize) {
+ byte[] data = ldr.getCachedBytes(bufferSize);
if (isBinary(data)) {
throw new BinaryBlobException();
}
return new RawText(data);
}
- byte[] head = new byte[FIRST_FEW_BYTES];
+ byte[] head = new byte[bufferSize];
try (InputStream stream = ldr.openStream()) {
int off = 0;
int left = head.length;
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java
index 9da890343f..1b03d097b6 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java
@@ -21,16 +21,15 @@ import org.eclipse.jgit.diff.RawText;
*
* Existing CRLF are not expanded to CRCRLF, but retained as is.
*
- * Optionally, a binary check on the first 8000 bytes is performed and in case
- * of binary files, canonicalization is turned off (for the complete file).
+ * Optionally, a binary check on the first {@link RawText#getBufferSize()} bytes
+ * is performed and in case of binary files, canonicalization is turned off (for
+ * the complete file).
*/
public class AutoCRLFInputStream extends InputStream {
- static final int BUFFER_SIZE = 8096;
-
private final byte[] single = new byte[1];
- private final byte[] buf = new byte[BUFFER_SIZE];
+ private final byte[] buf = new byte[RawText.getBufferSize()];
private final InputStream in;
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java
index 97fe01e5d9..05e271febd 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java
@@ -20,18 +20,17 @@ import org.eclipse.jgit.diff.RawText;
*
* Existing CRLF are not expanded to CRCRLF, but retained as is.
*
- * A binary check on the first 8000 bytes is performed and in case of binary
- * files, canonicalization is turned off (for the complete file).
+ * A binary check on the first {@link RawText#getBufferSize()} bytes is
+ * performed and in case of binary files, canonicalization is turned off (for
+ * the complete file).
*/
public class AutoCRLFOutputStream extends OutputStream {
- static final int BUFFER_SIZE = 8000;
-
private final OutputStream out;
private int buf = -1;
- private byte[] binbuf = new byte[BUFFER_SIZE];
+ private byte[] binbuf = new byte[RawText.getBufferSize()];
private byte[] onebytebuf = new byte[1];
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java
index 0e335a9dc4..b6d1848b3a 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java
@@ -25,10 +25,11 @@ import org.eclipse.jgit.diff.RawText;
* Existing single CR are not changed to LF but are retained as is.
* </p>
* <p>
- * Optionally, a binary check on the first 8kB is performed and in case of
- * binary files, canonicalization is turned off (for the complete file). If
- * binary checking determines that the input is CR/LF-delimited text and the
- * stream has been created for checkout, canonicalization is also turned off.
+ * Optionally, a binary check on the first {@link RawText#getBufferSize()} bytes
+ * is performed and in case of binary files, canonicalization is turned off (for
+ * the complete file). If binary checking determines that the input is
+ * CR/LF-delimited text and the stream has been created for checkout,
+ * canonicalization is also turned off.
* </p>
*
* @since 4.3
@@ -64,7 +65,7 @@ public class AutoLFInputStream extends InputStream {
private final byte[] single = new byte[1];
- private final byte[] buf = new byte[8 * 1024];
+ private final byte[] buf = new byte[RawText.getBufferSize()];
private final InputStream in;
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java
index 195fdb4213..e08a53f502 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java
@@ -22,23 +22,21 @@ import org.eclipse.jgit.diff.RawText;
* Existing single CR are not changed to LF, but retained as is.
* </p>
* <p>
- * A binary check on the first 8000 bytes is performed and in case of binary
- * files, canonicalization is turned off (for the complete file). If the binary
- * check determines that the input is not binary but text with CR/LF,
- * canonicalization is also turned off.
+ * A binary check on the first {@link RawText#getBufferSize()} bytes is
+ * performed and in case of binary files, canonicalization is turned off (for
+ * the complete file). If the binary check determines that the input is not
+ * binary but text with CR/LF, canonicalization is also turned off.
* </p>
*
* @since 4.3
*/
public class AutoLFOutputStream extends OutputStream {
- static final int BUFFER_SIZE = 8000;
-
private final OutputStream out;
private int buf = -1;
- private byte[] binbuf = new byte[BUFFER_SIZE];
+ private byte[] binbuf = new byte[RawText.getBufferSize()];
private byte[] onebytebuf = new byte[1];