about | summary | refs | log | tree | commit | diff | stats
path: root/org.eclipse.jgit
diff options
context:
space:
mode:
author: Shawn Pearce <spearce@spearce.org> 2016-01-11 12:30:35 -0800
committer: Shawn Pearce <spearce@spearce.org> 2016-01-11 12:30:35 -0800
commit: 31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6 (patch)
tree: fd3ae6103257768062d2f0f6a31018b3d8f0122f /org.eclipse.jgit
parent: 24cd8e170d377cef87166d43ca25bfa2340755c6 (diff)
download: jgit-31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6.tar.gz
jgit-31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6.zip
RevCommit: Better support invalid encoding headers
With this support we no longer need the 'utf-8' alias. UTF-8 will be automatically tried when the encoding header is not recognized and used if the character sequence cleanly decodes as UTF-8.

Modernize some of the references to use StandardCharsets.

Change-Id: I4c0c88750475560e1f2263180c4a98eb8febeca0
Diffstat (limited to 'org.eclipse.jgit')
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java | 65
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java | 39
-rw-r--r-- org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java | 91
3 files changed, 139 insertions, 56 deletions
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
index c23e4e3288..e67ada6022 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
@@ -44,12 +44,17 @@
package org.eclipse.jgit.revwalk;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.IOException;
import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import org.eclipse.jgit.annotations.Nullable;
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.AnyObjectId;
@@ -441,12 +446,12 @@ public class RevCommit extends RevObject {
* @return decoded commit message as a string. Never null.
*/
public final String getFullMessage() {
- final byte[] raw = buffer;
- final int msgB = RawParseUtils.commitMessage(raw, 0);
- if (msgB < 0)
+ byte[] raw = buffer;
+ int msgB = RawParseUtils.commitMessage(raw, 0);
+ if (msgB < 0) {
return ""; //$NON-NLS-1$
- final Charset enc = RawParseUtils.parseEncoding(raw);
- return RawParseUtils.decode(enc, raw, msgB, raw.length);
+ }
+ return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
}
/**
@@ -465,16 +470,17 @@ public class RevCommit extends RevObject {
* spanned multiple lines. Embedded LFs are converted to spaces.
*/
public final String getShortMessage() {
- final byte[] raw = buffer;
- final int msgB = RawParseUtils.commitMessage(raw, 0);
- if (msgB < 0)
+ byte[] raw = buffer;
+ int msgB = RawParseUtils.commitMessage(raw, 0);
+ if (msgB < 0) {
return ""; //$NON-NLS-1$
+ }
- final Charset enc = RawParseUtils.parseEncoding(raw);
- final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
- String str = RawParseUtils.decode(enc, raw, msgB, msgE);
- if (hasLF(raw, msgB, msgE))
+ int msgE = RawParseUtils.endOfParagraph(raw, msgB);
+ String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
+ if (hasLF(raw, msgB, msgE)) {
str = StringUtils.replaceLineBreaksWithSpace(str);
+ }
return str;
}
@@ -488,18 +494,49 @@ public class RevCommit extends RevObject {
/**
* Determine the encoding of the commit message buffer.
* <p>
+ * Locates the "encoding" header (if present) and returns its value. Due to
+ * corruption in the wild this may be an invalid encoding name that is not
+ * recognized by any character encoding library.
+ * <p>
+ * If no encoding header is present, null.
+ *
+ * @return the preferred encoding of {@link #getRawBuffer()}; or null.
+ * @since 4.2
+ */
+ @Nullable
+ public final String getEncodingName() {
+ return RawParseUtils.parseEncodingName(buffer);
+ }
+
+ /**
+ * Determine the encoding of the commit message buffer.
+ * <p>
* Locates the "encoding" header (if present) and then returns the proper
* character set to apply to this buffer to evaluate its contents as
* character data.
* <p>
- * If no encoding header is present, {@link Constants#CHARSET} is assumed.
+ * If no encoding header is present {@code UTF-8} is assumed.
*
* @return the preferred encoding of {@link #getRawBuffer()}.
+ * @throws IllegalCharsetNameException
+ * if the character set requested by the encoding header is
+ * malformed and unsupportable.
+ * @throws UnsupportedCharsetException
+ * if the JRE does not support the character set requested by
+ * the encoding header.
*/
public final Charset getEncoding() {
return RawParseUtils.parseEncoding(buffer);
}
+ private Charset guessEncoding() {
+ try {
+ return getEncoding();
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+ return UTF_8;
+ }
+ }
+
/**
* Parse the footer lines (e.g. "Signed-off-by") for machine processing.
* <p>
@@ -529,7 +566,7 @@ public class RevCommit extends RevObject {
final int msgB = RawParseUtils.commitMessage(raw, 0);
final ArrayList<FooterLine> r = new ArrayList<FooterLine>(4);
- final Charset enc = getEncoding();
+ final Charset enc = guessEncoding();
for (;;) {
ptr = RawParseUtils.prevLF(raw, ptr);
if (ptr <= msgB)
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
index bf2785e0d7..81a54bf7ea 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
@@ -45,8 +45,12 @@
package org.eclipse.jgit.revwalk;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.IOException;
import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
import org.eclipse.jgit.errors.CorruptObjectException;
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
@@ -162,7 +166,7 @@ public class RevTag extends RevObject {
int p = pos.value += 4; // "tag "
final int nameEnd = RawParseUtils.nextLF(rawTag, p) - 1;
- tagName = RawParseUtils.decode(Constants.CHARSET, rawTag, p, nameEnd);
+ tagName = RawParseUtils.decode(UTF_8, rawTag, p, nameEnd);
if (walk.isRetainBody())
buffer = rawTag;
@@ -207,12 +211,12 @@ public class RevTag extends RevObject {
* @return decoded tag message as a string. Never null.
*/
public final String getFullMessage() {
- final byte[] raw = buffer;
- final int msgB = RawParseUtils.tagMessage(raw, 0);
- if (msgB < 0)
+ byte[] raw = buffer;
+ int msgB = RawParseUtils.tagMessage(raw, 0);
+ if (msgB < 0) {
return ""; //$NON-NLS-1$
- final Charset enc = RawParseUtils.parseEncoding(raw);
- return RawParseUtils.decode(enc, raw, msgB, raw.length);
+ }
+ return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
}
/**
@@ -231,19 +235,28 @@ public class RevTag extends RevObject {
* multiple lines. Embedded LFs are converted to spaces.
*/
public final String getShortMessage() {
- final byte[] raw = buffer;
- final int msgB = RawParseUtils.tagMessage(raw, 0);
- if (msgB < 0)
+ byte[] raw = buffer;
+ int msgB = RawParseUtils.tagMessage(raw, 0);
+ if (msgB < 0) {
return ""; //$NON-NLS-1$
+ }
- final Charset enc = RawParseUtils.parseEncoding(raw);
- final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
- String str = RawParseUtils.decode(enc, raw, msgB, msgE);
- if (RevCommit.hasLF(raw, msgB, msgE))
+ int msgE = RawParseUtils.endOfParagraph(raw, msgB);
+ String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
+ if (RevCommit.hasLF(raw, msgB, msgE)) {
str = StringUtils.replaceLineBreaksWithSpace(str);
+ }
return str;
}
+ private Charset guessEncoding() {
+ try {
+ return RawParseUtils.parseEncoding(buffer);
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+ return UTF_8;
+ }
+ }
+
/**
* Get a reference to the object this tag was placed on.
* <p>
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
index a20e0b0603..f2955f7e6b 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
@@ -44,6 +44,8 @@
package org.eclipse.jgit.util;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.eclipse.jgit.lib.ObjectChecker.author;
import static org.eclipse.jgit.lib.ObjectChecker.committer;
import static org.eclipse.jgit.lib.ObjectChecker.encoding;
@@ -60,6 +62,7 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
+import org.eclipse.jgit.annotations.Nullable;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.PersonIdent;
@@ -70,7 +73,7 @@ public final class RawParseUtils {
*
* @since 2.2
*/
- public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$
+ public static final Charset UTF8_CHARSET = UTF_8;
private static final byte[] digits10;
@@ -81,9 +84,9 @@ public final class RawParseUtils {
private static final Map<String, Charset> encodingAliases;
static {
- encodingAliases = new HashMap<String, Charset>();
- encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$
- encodingAliases.put("'utf8'", Charset.forName("UTF-8")); //$NON-NLS-1$ //$NON-NLS-2$
+ encodingAliases = new HashMap<>();
+ encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
+ encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
digits10 = new byte['9' + 1];
Arrays.fill(digits10, (byte) -1);
@@ -672,35 +675,60 @@ public final class RawParseUtils {
}
/**
+ * Parse the "encoding " header as a string.
+ * <p>
+ * Locates the "encoding " header (if present) and returns its value.
+ *
+ * @param b
+ * buffer to scan.
+ * @return the encoding header as specified in the commit; null if the
+ * header was not present and should be assumed.
+ * @since 4.2
+ */
+ @Nullable
+ public static String parseEncodingName(final byte[] b) {
+ int enc = encoding(b, 0);
+ if (enc < 0) {
+ return null;
+ }
+ int lf = nextLF(b, enc);
+ return decode(UTF_8, b, enc, lf - 1);
+ }
+
+ /**
* Parse the "encoding " header into a character set reference.
* <p>
* Locates the "encoding " header (if present) by first calling
* {@link #encoding(byte[], int)} and then returns the proper character set
* to apply to this buffer to evaluate its contents as character data.
* <p>
- * If no encoding header is present, {@link Constants#CHARSET} is assumed.
+ * If no encoding header is present {@code UTF-8} is assumed.
*
* @param b
* buffer to scan.
* @return the Java character set representation. Never null.
+ * @throws IllegalCharsetNameException
+ * if the character set requested by the encoding header is
+ * malformed and unsupportable.
+ * @throws UnsupportedCharsetException
+ * if the JRE does not support the character set requested by
+ * the encoding header.
*/
public static Charset parseEncoding(final byte[] b) {
- final int enc = encoding(b, 0);
- if (enc < 0)
- return Constants.CHARSET;
- final int lf = nextLF(b, enc);
- String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
+ String enc = parseEncodingName(b);
+ if (enc == null) {
+ return UTF_8;
+ }
+
+ String name = enc.trim();
try {
- return Charset.forName(decoded);
- } catch (IllegalCharsetNameException badName) {
- Charset aliased = charsetForAlias(decoded);
- if (aliased != null)
- return aliased;
- throw badName;
- } catch (UnsupportedCharsetException badName) {
- Charset aliased = charsetForAlias(decoded);
- if (aliased != null)
+ return Charset.forName(name);
+ } catch (IllegalCharsetNameException
+ | UnsupportedCharsetException badName) {
+ Charset aliased = charsetForAlias(name);
+ if (aliased != null) {
return aliased;
+ }
throw badName;
}
}
@@ -739,7 +767,15 @@ public final class RawParseUtils {
* parsed.
*/
public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
- final Charset cs = parseEncoding(raw);
+ Charset cs;
+ try {
+ cs = parseEncoding(raw);
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+ // Assume UTF-8 for person identities, usually this is correct.
+ // If not decode() will fall back to the ISO-8859-1 encoding.
+ cs = UTF_8;
+ }
+
final int emailB = nextLF(raw, nameB, '<');
final int emailE = nextLF(raw, emailB, '>');
if (emailB >= raw.length || raw[emailB] == '\n' ||
@@ -887,7 +923,7 @@ public final class RawParseUtils {
*/
public static String decode(final byte[] buffer, final int start,
final int end) {
- return decode(Constants.CHARSET, buffer, start, end);
+ return decode(UTF_8, buffer, start, end);
}
/**
@@ -961,23 +997,21 @@ public final class RawParseUtils {
public static String decodeNoFallback(final Charset cs,
final byte[] buffer, final int start, final int end)
throws CharacterCodingException {
- final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
+ ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
b.mark();
// Try our built-in favorite. The assumption here is that
// decoding will fail if the data is not actually encoded
// using that encoder.
- //
try {
- return decode(b, Constants.CHARSET);
+ return decode(b, UTF_8);
} catch (CharacterCodingException e) {
b.reset();
}
- if (!cs.equals(Constants.CHARSET)) {
+ if (!cs.equals(UTF_8)) {
// Try the suggested encoding, it might be right since it was
// provided by the caller.
- //
try {
return decode(b, cs);
} catch (CharacterCodingException e) {
@@ -987,9 +1021,8 @@ public final class RawParseUtils {
// Try the default character set. A small group of people
// might actually use the same (or very similar) locale.
- //
- final Charset defcs = Charset.defaultCharset();
- if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
+ Charset defcs = Charset.defaultCharset();
+ if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
try {
return decode(b, defcs);
} catch (CharacterCodingException e) {