Merge "RevCommit: Better support invalid encoding headers"

author: Shawn Pearce <spearce@spearce.org> 2016-01-11 15:48:14 -0500
committer: Gerrit Code Review @ Eclipse.org <gerrit@eclipse.org> 2016-01-11 15:48:16 -0500
commit: 8f31aa7c285631467b63c78cc71caf6e40333529 (patch)
tree: f26705ba52f357b3d659f9a09a559360a2588a4b /org.eclipse.jgit
parent: 0f8743d4d7a4f3af1eccea60d45d51d13f1a2ad4 (diff)
parent: 31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6 (diff)
download: jgit-8f31aa7c285631467b63c78cc71caf6e40333529.tar.gz jgit-8f31aa7c285631467b63c78cc71caf6e40333529.zip
3 files changed, 139 insertions, 56 deletions
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
index c23e4e3288..e67ada6022 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
@@ -44,12 +44,17 @@
 
 package org.eclipse.jgit.revwalk;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.IOException;
 import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 
+import org.eclipse.jgit.annotations.Nullable;
 import org.eclipse.jgit.errors.IncorrectObjectTypeException;
 import org.eclipse.jgit.errors.MissingObjectException;
 import org.eclipse.jgit.lib.AnyObjectId;
@@ -441,12 +446,12 @@ public class RevCommit extends RevObject {
 	 * @return decoded commit message as a string. Never null.
 	 */
 	public final String getFullMessage() {
-		final byte[] raw = buffer;
-		final int msgB = RawParseUtils.commitMessage(raw, 0);
-		if (msgB < 0)
+		byte[] raw = buffer;
+		int msgB = RawParseUtils.commitMessage(raw, 0);
+		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
-		final Charset enc = RawParseUtils.parseEncoding(raw);
-		return RawParseUtils.decode(enc, raw, msgB, raw.length);
+		}
+		return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
 	}
 
 	/**
@@ -465,16 +470,17 @@ public class RevCommit extends RevObject {
 	 *         spanned multiple lines. Embedded LFs are converted to spaces.
 	 */
 	public final String getShortMessage() {
-		final byte[] raw = buffer;
-		final int msgB = RawParseUtils.commitMessage(raw, 0);
-		if (msgB < 0)
+		byte[] raw = buffer;
+		int msgB = RawParseUtils.commitMessage(raw, 0);
+		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
+		}
 
-		final Charset enc = RawParseUtils.parseEncoding(raw);
-		final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
-		String str = RawParseUtils.decode(enc, raw, msgB, msgE);
-		if (hasLF(raw, msgB, msgE))
+		int msgE = RawParseUtils.endOfParagraph(raw, msgB);
+		String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
+		if (hasLF(raw, msgB, msgE)) {
 			str = StringUtils.replaceLineBreaksWithSpace(str);
+		}
 		return str;
 	}
 
@@ -488,18 +494,49 @@ public class RevCommit extends RevObject {
 	/**
 	 * Determine the encoding of the commit message buffer.
 	 * <p>
+	 * Locates the "encoding" header (if present) and returns its value. Due to
+	 * corruption in the wild this may be an invalid encoding name that is not
+	 * recognized by any character encoding library.
+	 * <p>
+	 * If no encoding header is present, null.
+	 *
+	 * @return the preferred encoding of {@link #getRawBuffer()}; or null.
+	 * @since 4.2
+	 */
+	@Nullable
+	public final String getEncodingName() {
+		return RawParseUtils.parseEncodingName(buffer);
+	}
+
+	/**
+	 * Determine the encoding of the commit message buffer.
+	 * <p>
 	 * Locates the "encoding" header (if present) and then returns the proper
 	 * character set to apply to this buffer to evaluate its contents as
 	 * character data.
 	 * <p>
-	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
+	 * If no encoding header is present {@code UTF-8} is assumed.
 	 *
 	 * @return the preferred encoding of {@link #getRawBuffer()}.
+	 * @throws IllegalCharsetNameException
+	 *             if the character set requested by the encoding header is
+	 *             malformed and unsupportable.
+	 * @throws UnsupportedCharsetException
+	 *             if the JRE does not support the character set requested by
+	 *             the encoding header.
 	 */
 	public final Charset getEncoding() {
 		return RawParseUtils.parseEncoding(buffer);
 	}
 
+	private Charset guessEncoding() {
+		try {
+			return getEncoding();
+		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+			return UTF_8;
+		}
+	}
+
 	/**
 	 * Parse the footer lines (e.g. "Signed-off-by") for machine processing.
 	 * <p>
@@ -529,7 +566,7 @@ public class RevCommit extends RevObject {
 
 		final int msgB = RawParseUtils.commitMessage(raw, 0);
 		final ArrayList<FooterLine> r = new ArrayList<FooterLine>(4);
-		final Charset enc = getEncoding();
+		final Charset enc = guessEncoding();
 		for (;;) {
 			ptr = RawParseUtils.prevLF(raw, ptr);
 			if (ptr <= msgB)
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
index bf2785e0d7..81a54bf7ea 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
@@ -45,8 +45,12 @@
 
 package org.eclipse.jgit.revwalk;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.IOException;
 import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
 
 import org.eclipse.jgit.errors.CorruptObjectException;
 import org.eclipse.jgit.errors.IncorrectObjectTypeException;
@@ -162,7 +166,7 @@ public class RevTag extends RevObject {
 
 		int p = pos.value += 4; // "tag "
 		final int nameEnd = RawParseUtils.nextLF(rawTag, p) - 1;
-		tagName = RawParseUtils.decode(Constants.CHARSET, rawTag, p, nameEnd);
+		tagName = RawParseUtils.decode(UTF_8, rawTag, p, nameEnd);
 
 		if (walk.isRetainBody())
 			buffer = rawTag;
@@ -207,12 +211,12 @@ public class RevTag extends RevObject {
 	 * @return decoded tag message as a string. Never null.
 	 */
 	public final String getFullMessage() {
-		final byte[] raw = buffer;
-		final int msgB = RawParseUtils.tagMessage(raw, 0);
-		if (msgB < 0)
+		byte[] raw = buffer;
+		int msgB = RawParseUtils.tagMessage(raw, 0);
+		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
-		final Charset enc = RawParseUtils.parseEncoding(raw);
-		return RawParseUtils.decode(enc, raw, msgB, raw.length);
+		}
+		return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
 	}
 
 	/**
@@ -231,19 +235,28 @@ public class RevTag extends RevObject {
 	 *         multiple lines. Embedded LFs are converted to spaces.
 	 */
 	public final String getShortMessage() {
-		final byte[] raw = buffer;
-		final int msgB = RawParseUtils.tagMessage(raw, 0);
-		if (msgB < 0)
+		byte[] raw = buffer;
+		int msgB = RawParseUtils.tagMessage(raw, 0);
+		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
+		}
 
-		final Charset enc = RawParseUtils.parseEncoding(raw);
-		final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
-		String str = RawParseUtils.decode(enc, raw, msgB, msgE);
-		if (RevCommit.hasLF(raw, msgB, msgE))
+		int msgE = RawParseUtils.endOfParagraph(raw, msgB);
+		String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
+		if (RevCommit.hasLF(raw, msgB, msgE)) {
 			str = StringUtils.replaceLineBreaksWithSpace(str);
+		}
 		return str;
 	}
 
+	private Charset guessEncoding() {
+		try {
+			return RawParseUtils.parseEncoding(buffer);
+		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+			return UTF_8;
+		}
+	}
+
 	/**
 	 * Get a reference to the object this tag was placed on.
 	 * <p>
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
index a20e0b0603..f2955f7e6b 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
@@ -44,6 +44,8 @@
 
 package org.eclipse.jgit.util;
 
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.eclipse.jgit.lib.ObjectChecker.author;
 import static org.eclipse.jgit.lib.ObjectChecker.committer;
 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
@@ -60,6 +62,7 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.eclipse.jgit.annotations.Nullable;
 import org.eclipse.jgit.lib.Constants;
 import org.eclipse.jgit.lib.PersonIdent;
 
@@ -70,7 +73,7 @@ public final class RawParseUtils {
 	 *
 	 * @since 2.2
 	 */
-	public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$
+	public static final Charset UTF8_CHARSET = UTF_8;
 
 	private static final byte[] digits10;
 
@@ -81,9 +84,9 @@ public final class RawParseUtils {
 	private static final Map<String, Charset> encodingAliases;
 
 	static {
-		encodingAliases = new HashMap<String, Charset>();
-		encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$
-		encodingAliases.put("'utf8'", Charset.forName("UTF-8")); //$NON-NLS-1$ //$NON-NLS-2$
+		encodingAliases = new HashMap<>();
+		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
+		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
 
 		digits10 = new byte['9' + 1];
 		Arrays.fill(digits10, (byte) -1);
@@ -672,35 +675,60 @@ public final class RawParseUtils {
 	}
 
 	/**
+	 * Parse the "encoding " header as a string.
+	 * <p>
+	 * Locates the "encoding " header (if present) and returns its value.
+	 *
+	 * @param b
+	 *            buffer to scan.
+	 * @return the encoding header as specified in the commit; null if the
+	 *         header was not present and should be assumed.
+	 * @since 4.2
+	 */
+	@Nullable
+	public static String parseEncodingName(final byte[] b) {
+		int enc = encoding(b, 0);
+		if (enc < 0) {
+			return null;
+		}
+		int lf = nextLF(b, enc);
+		return decode(UTF_8, b, enc, lf - 1);
+	}
+
+	/**
 	 * Parse the "encoding " header into a character set reference.
 	 * <p>
 	 * Locates the "encoding " header (if present) by first calling
 	 * {@link #encoding(byte[], int)} and then returns the proper character set
 	 * to apply to this buffer to evaluate its contents as character data.
 	 * <p>
-	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
+	 * If no encoding header is present {@code UTF-8} is assumed.
 	 *
 	 * @param b
 	 *            buffer to scan.
 	 * @return the Java character set representation. Never null.
+	 * @throws IllegalCharsetNameException
+	 *             if the character set requested by the encoding header is
+	 *             malformed and unsupportable.
+	 * @throws UnsupportedCharsetException
+	 *             if the JRE does not support the character set requested by
+	 *             the encoding header.
 	 */
 	public static Charset parseEncoding(final byte[] b) {
-		final int enc = encoding(b, 0);
-		if (enc < 0)
-			return Constants.CHARSET;
-		final int lf = nextLF(b, enc);
-		String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
+		String enc = parseEncodingName(b);
+		if (enc == null) {
+			return UTF_8;
+		}
+
+		String name = enc.trim();
 		try {
-			return Charset.forName(decoded);
-		} catch (IllegalCharsetNameException badName) {
-			Charset aliased = charsetForAlias(decoded);
-			if (aliased != null)
-				return aliased;
-			throw badName;
-		} catch (UnsupportedCharsetException badName) {
-			Charset aliased = charsetForAlias(decoded);
-			if (aliased != null)
+			return Charset.forName(name);
+		} catch (IllegalCharsetNameException
+				| UnsupportedCharsetException badName) {
+			Charset aliased = charsetForAlias(name);
+			if (aliased != null) {
 				return aliased;
+			}
 			throw badName;
 		}
 	}
@@ -739,7 +767,15 @@ public final class RawParseUtils {
 	 *         parsed.
 	 */
 	public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
-		final Charset cs = parseEncoding(raw);
+		Charset cs;
+		try {
+			cs = parseEncoding(raw);
+		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+			// Assume UTF-8 for person identities, usually this is correct.
+			// If not decode() will fall back to the ISO-8859-1 encoding.
+			cs = UTF_8;
+		}
+
 		final int emailB = nextLF(raw, nameB, '<');
 		final int emailE = nextLF(raw, emailB, '>');
 		if (emailB >= raw.length || raw[emailB] == '\n' ||
@@ -887,7 +923,7 @@ public final class RawParseUtils {
 	 */
 	public static String decode(final byte[] buffer, final int start,
 			final int end) {
-		return decode(Constants.CHARSET, buffer, start, end);
+		return decode(UTF_8, buffer, start, end);
 	}
 
 	/**
@@ -961,23 +997,21 @@ public final class RawParseUtils {
 	public static String decodeNoFallback(final Charset cs,
 			final byte[] buffer, final int start, final int end)
 			throws CharacterCodingException {
-		final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
+		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 		b.mark();
 
 		// Try our built-in favorite. The assumption here is that
 		// decoding will fail if the data is not actually encoded
 		// using that encoder.
-		//
 		try {
-			return decode(b, Constants.CHARSET);
+			return decode(b, UTF_8);
 		} catch (CharacterCodingException e) {
 			b.reset();
 		}
 
-		if (!cs.equals(Constants.CHARSET)) {
+		if (!cs.equals(UTF_8)) {
 			// Try the suggested encoding, it might be right since it was
 			// provided by the caller.
-			//
 			try {
 				return decode(b, cs);
 			} catch (CharacterCodingException e) {
@@ -987,9 +1021,8 @@ public final class RawParseUtils {
 
 		// Try the default character set. A small group of people
 		// might actually use the same (or very similar) locale.
-		//
-		final Charset defcs = Charset.defaultCharset();
-		if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
+		Charset defcs = Charset.defaultCharset();
+		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
 			try {
 				return decode(b, defcs);
 			} catch (CharacterCodingException e) {
author	Shawn Pearce <spearce@spearce.org>	2016-01-11 15:48:14 -0500
committer	Gerrit Code Review @ Eclipse.org <gerrit@eclipse.org>	2016-01-11 15:48:16 -0500
commit	8f31aa7c285631467b63c78cc71caf6e40333529 (patch)
tree	f26705ba52f357b3d659f9a09a559360a2588a4b /org.eclipse.jgit
parent	0f8743d4d7a4f3af1eccea60d45d51d13f1a2ad4 (diff)
parent	31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6 (diff)
download	jgit-8f31aa7c285631467b63c78cc71caf6e40333529.tar.gz jgit-8f31aa7c285631467b63c78cc71caf6e40333529.zip