With this support we no longer need the "'utf8'" alias. UTF-8 will be automatically tried when the encoding header is not recognized and used if the character sequence cleanly decodes as UTF-8. Modernize some of the references to use StandardCharsets. Change-Id: I4c0c88750475560e1f2263180c4a98eb8febeca0

8 years ago · 31d92ace5b
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java
@@ -43,13 +43,18 @@

 package org.eclipse.jgit.revwalk;

 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;

 import java.io.ByteArrayOutputStream;
 import java.io.UnsupportedEncodingException;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.TimeZone;

 import org.eclipse.jgit.junit.RepositoryTestCase;
@@ -303,6 +308,86 @@ public class RevCommitParseTest extends RepositoryTestCase {
 		assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
 	}

 	/**
 	 * A commit whose encoding header is the quoted name {@code 'utf8'} must
 	 * still yield its message, while {@code getEncoding()} reports the bad
 	 * name.
 	 */
 	@Test
 	public void testParse_incorrectUtf8Name() throws Exception {
 		ByteArrayOutputStream b = new ByteArrayOutputStream();
 		b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n"
 				.getBytes(UTF_8));
 		b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
 		b.write("committer co <c@example.com> 1218123390 -0500\n"
 				.getBytes(UTF_8));
 		b.write("encoding 'utf8'\n".getBytes(UTF_8));
 		b.write("\n".getBytes(UTF_8));
 		b.write("Sm\u00f6rg\u00e5sbord\n".getBytes(UTF_8));

 		RevCommit c = new RevCommit(
 				id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
 		// Release the walk as soon as parsing is done instead of leaking it.
 		try (RevWalk rw = new RevWalk(db)) {
 			c.parseCanonical(rw, b.toByteArray());
 		}
 		// The header value is expected verbatim, quotes included.
 		assertEquals("'utf8'", c.getEncodingName());
 		assertEquals("Sm\u00f6rg\u00e5sbord\n", c.getFullMessage());

 		try {
 			c.getEncoding();
 			fail("Expected " + IllegalCharsetNameException.class);
 		} catch (IllegalCharsetNameException badName) {
 			assertEquals("'utf8'", badName.getMessage());
 		}
 	}

 	/**
 	 * A commit whose encoding header is not a legal charset name must still
 	 * expose its message, footers and author, while {@code getEncoding()}
 	 * throws.
 	 */
 	@Test
 	public void testParse_illegalEncoding() throws Exception {
 		ByteArrayOutputStream b = new ByteArrayOutputStream();
 		b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
 		b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
 		b.write("committer co <c@example.com> 1218123390 -0500\n".getBytes(UTF_8));
 		b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8));
 		b.write("\n".getBytes(UTF_8));
 		b.write("message\n".getBytes(UTF_8));

 		RevCommit c = new RevCommit(
 				id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
 		// Release the walk as soon as parsing is done instead of leaking it.
 		try (RevWalk rw = new RevWalk(db)) {
 			c.parseCanonical(rw, b.toByteArray());
 		}
 		assertEquals("utf-8logoutputencoding=gbk", c.getEncodingName());
 		assertEquals("message\n", c.getFullMessage());
 		assertEquals("message", c.getShortMessage());
 		assertTrue(c.getFooterLines().isEmpty());
 		assertEquals("au", c.getAuthorIdent().getName());

 		try {
 			c.getEncoding();
 			fail("Expected " + IllegalCharsetNameException.class);
 		} catch (IllegalCharsetNameException badName) {
 			assertEquals("utf-8logoutputencoding=gbk", badName.getMessage());
 		}
 	}

 	/**
 	 * A commit whose encoding header is {@code it_IT.UTF8} must still expose
 	 * its message, footers and author, while {@code getEncoding()} throws
 	 * {@link UnsupportedCharsetException}.
 	 */
 	@Test
 	public void testParse_unsupportedEncoding() throws Exception {
 		ByteArrayOutputStream b = new ByteArrayOutputStream();
 		b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
 		b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
 		b.write("committer co <c@example.com> 1218123390 -0500\n".getBytes(UTF_8));
 		b.write("encoding it_IT.UTF8\n".getBytes(UTF_8));
 		b.write("\n".getBytes(UTF_8));
 		b.write("message\n".getBytes(UTF_8));

 		RevCommit c = new RevCommit(
 				id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
 		// Release the walk as soon as parsing is done instead of leaking it.
 		try (RevWalk rw = new RevWalk(db)) {
 			c.parseCanonical(rw, b.toByteArray());
 		}
 		assertEquals("it_IT.UTF8", c.getEncodingName());
 		assertEquals("message\n", c.getFullMessage());
 		assertEquals("message", c.getShortMessage());
 		assertTrue(c.getFooterLines().isEmpty());
 		assertEquals("au", c.getAuthorIdent().getName());

 		try {
 			c.getEncoding();
 			fail("Expected " + UnsupportedCharsetException.class);
 		} catch (UnsupportedCharsetException badName) {
 			assertEquals("it_IT.UTF8", badName.getMessage());
 		}
 	}

 	@Test
 	public void testParse_NoMessage() throws Exception {
 		final String msg = "";
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java
@@ -43,6 +43,7 @@

 package org.eclipse.jgit.revwalk;

 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
@@ -361,6 +362,44 @@ public class RevTagParseTest extends RepositoryTestCase {
 		assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
 	}

 	/**
 	 * A tag whose encoding header is not a legal charset name must still
 	 * expose its tagger and message.
 	 */
 	@Test
 	public void testParse_illegalEncoding() throws Exception {
 		ByteArrayOutputStream b = new ByteArrayOutputStream();
 		b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
 		b.write("type tree\n".getBytes(UTF_8));
 		b.write("tag v1.0\n".getBytes(UTF_8));
 		b.write("tagger t <t@example.com> 1218123387 +0700\n".getBytes(UTF_8));
 		b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8));
 		b.write("\n".getBytes(UTF_8));
 		b.write("message\n".getBytes(UTF_8));

 		RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
 		// Release the walk as soon as parsing is done instead of leaking it.
 		try (RevWalk rw = new RevWalk(db)) {
 			t.parseCanonical(rw, b.toByteArray());
 		}

 		assertEquals("t", t.getTaggerIdent().getName());
 		assertEquals("message", t.getShortMessage());
 		assertEquals("message\n", t.getFullMessage());
 	}

 	/**
 	 * A tag whose encoding header is {@code it_IT.UTF8} must still expose
 	 * its tagger and message.
 	 */
 	@Test
 	public void testParse_unsupportedEncoding() throws Exception {
 		ByteArrayOutputStream b = new ByteArrayOutputStream();
 		b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
 		b.write("type tree\n".getBytes(UTF_8));
 		b.write("tag v1.0\n".getBytes(UTF_8));
 		b.write("tagger t <t@example.com> 1218123387 +0700\n".getBytes(UTF_8));
 		b.write("encoding it_IT.UTF8\n".getBytes(UTF_8));
 		b.write("\n".getBytes(UTF_8));
 		b.write("message\n".getBytes(UTF_8));

 		RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
 		// Release the walk as soon as parsing is done instead of leaking it.
 		try (RevWalk rw = new RevWalk(db)) {
 			t.parseCanonical(rw, b.toByteArray());
 		}

 		assertEquals("t", t.getTaggerIdent().getName());
 		assertEquals("message", t.getShortMessage());
 		assertEquals("message\n", t.getFullMessage());
 	}

 	@Test
 	public void testParse_NoMessage() throws Exception {
 		final String msg = "";
--- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
@@ -44,12 +44,17 @@

 package org.eclipse.jgit.revwalk;

 import static java.nio.charset.StandardCharsets.UTF_8;

 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;

 import org.eclipse.jgit.annotations.Nullable;
 import org.eclipse.jgit.errors.IncorrectObjectTypeException;
 import org.eclipse.jgit.errors.MissingObjectException;
 import org.eclipse.jgit.lib.AnyObjectId;
@@ -441,12 +446,12 @@ public class RevCommit extends RevObject {
 	 * @return decoded commit message as a string. Never null.
 	 */
 	public final String getFullMessage() {
 		final byte[] raw = buffer;
 		final int msgB = RawParseUtils.commitMessage(raw, 0);
 		if (msgB < 0)
 		byte[] raw = buffer;
 		int msgB = RawParseUtils.commitMessage(raw, 0);
 		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
 		final Charset enc = RawParseUtils.parseEncoding(raw);
 		return RawParseUtils.decode(enc, raw, msgB, raw.length);
 		}
 		return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
 	}

 	/**
@@ -465,16 +470,17 @@ public class RevCommit extends RevObject {
 	 *         spanned multiple lines. Embedded LFs are converted to spaces.
 	 */
 	public final String getShortMessage() {
 		final byte[] raw = buffer;
 		final int msgB = RawParseUtils.commitMessage(raw, 0);
 		if (msgB < 0)
 		byte[] raw = buffer;
 		int msgB = RawParseUtils.commitMessage(raw, 0);
 		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
 		}

 		final Charset enc = RawParseUtils.parseEncoding(raw);
 		final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
 		String str = RawParseUtils.decode(enc, raw, msgB, msgE);
 		if (hasLF(raw, msgB, msgE))
 		int msgE = RawParseUtils.endOfParagraph(raw, msgB);
 		String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
 		if (hasLF(raw, msgB, msgE)) {
 			str = StringUtils.replaceLineBreaksWithSpace(str);
 		}
 		return str;
 	}

@@ -485,6 +491,23 @@ public class RevCommit extends RevObject {
 		return false;
 	}

 	/**
 	 * Get the value of the "encoding" header as recorded in the commit.
 	 * <p>
 	 * The value is returned as-is and is not validated. Because of
 	 * corruption in the wild it may be a name that no character encoding
 	 * library recognizes.
 	 * <p>
 	 * When the commit carries no encoding header the result is null.
 	 *
 	 * @return the encoding name declared for {@link #getRawBuffer()}; or
 	 *         null.
 	 * @since 4.2
 	 */
 	@Nullable
 	public final String getEncodingName() {
 		final String headerValue = RawParseUtils.parseEncodingName(buffer);
 		return headerValue;
 	}

 	/**
 	 * Determine the encoding of the commit message buffer.
 	 * <p>
@@ -492,14 +515,28 @@ public class RevCommit extends RevObject {
 	 * character set to apply to this buffer to evaluate its contents as
 	 * character data.
 	 * <p>
 	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
 	 * If no encoding header is present {@code UTF-8} is assumed.
 	 *
 	 * @return the preferred encoding of {@link #getRawBuffer()}.
 	 * @throws IllegalCharsetNameException
 	 *             if the character set requested by the encoding header is
 	 *             malformed and unsupportable.
 	 * @throws UnsupportedCharsetException
 	 *             if the JRE does not support the character set requested by
 	 *             the encoding header.
 	 */
 	public final Charset getEncoding() {
 		// Strict lookup: charset-resolution failures from parseEncoding
 		// propagate to the caller unchanged.
 		final Charset declared = RawParseUtils.parseEncoding(buffer);
 		return declared;
 	}

 	// Lenient variant of getEncoding(): an encoding header that is illegal
 	// or unsupported yields UTF-8 instead of an exception.
 	private Charset guessEncoding() {
 		Charset cs;
 		try {
 			cs = getEncoding();
 		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
 			cs = UTF_8;
 		}
 		return cs;
 	}

 	/**
 	 * Parse the footer lines (e.g. "Signed-off-by") for machine processing.
 	 * <p>
@@ -529,7 +566,7 @@ public class RevCommit extends RevObject {

 		final int msgB = RawParseUtils.commitMessage(raw, 0);
 		final ArrayList<FooterLine> r = new ArrayList<FooterLine>(4);
 		final Charset enc = getEncoding();
 		final Charset enc = guessEncoding();
 		for (;;) {
 			ptr = RawParseUtils.prevLF(raw, ptr);
 			if (ptr <= msgB)
--- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
@@ -45,8 +45,12 @@

 package org.eclipse.jgit.revwalk;

 import static java.nio.charset.StandardCharsets.UTF_8;

 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;

 import org.eclipse.jgit.errors.CorruptObjectException;
 import org.eclipse.jgit.errors.IncorrectObjectTypeException;
@@ -162,7 +166,7 @@ public class RevTag extends RevObject {

 		int p = pos.value += 4; // "tag "
 		final int nameEnd = RawParseUtils.nextLF(rawTag, p) - 1;
 		tagName = RawParseUtils.decode(Constants.CHARSET, rawTag, p, nameEnd);
 		tagName = RawParseUtils.decode(UTF_8, rawTag, p, nameEnd);

 		if (walk.isRetainBody())
 			buffer = rawTag;
@@ -207,12 +211,12 @@ public class RevTag extends RevObject {
 	 * @return decoded tag message as a string. Never null.
 	 */
 	public final String getFullMessage() {
 		final byte[] raw = buffer;
 		final int msgB = RawParseUtils.tagMessage(raw, 0);
 		if (msgB < 0)
 		byte[] raw = buffer;
 		int msgB = RawParseUtils.tagMessage(raw, 0);
 		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
 		final Charset enc = RawParseUtils.parseEncoding(raw);
 		return RawParseUtils.decode(enc, raw, msgB, raw.length);
 		}
 		return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
 	}

 	/**
@@ -231,19 +235,28 @@ public class RevTag extends RevObject {
 	 *         multiple lines. Embedded LFs are converted to spaces.
 	 */
 	public final String getShortMessage() {
 		final byte[] raw = buffer;
 		final int msgB = RawParseUtils.tagMessage(raw, 0);
 		if (msgB < 0)
 		byte[] raw = buffer;
 		int msgB = RawParseUtils.tagMessage(raw, 0);
 		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
 		}

 		final Charset enc = RawParseUtils.parseEncoding(raw);
 		final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
 		String str = RawParseUtils.decode(enc, raw, msgB, msgE);
 		if (RevCommit.hasLF(raw, msgB, msgE))
 		int msgE = RawParseUtils.endOfParagraph(raw, msgB);
 		String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
 		if (RevCommit.hasLF(raw, msgB, msgE)) {
 			str = StringUtils.replaceLineBreaksWithSpace(str);
 		}
 		return str;
 	}

 	// Resolve the tag's encoding header leniently: a header that is illegal
 	// or unsupported yields UTF-8 instead of an exception.
 	private Charset guessEncoding() {
 		Charset cs;
 		try {
 			cs = RawParseUtils.parseEncoding(buffer);
 		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
 			cs = UTF_8;
 		}
 		return cs;
 	}

 	/**
 	 * Get a reference to the object this tag was placed on.
 	 * <p>
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
@@ -44,6 +44,8 @@

 package org.eclipse.jgit.util;

 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.eclipse.jgit.lib.ObjectChecker.author;
 import static org.eclipse.jgit.lib.ObjectChecker.committer;
 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
@@ -60,6 +62,7 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;

 import org.eclipse.jgit.annotations.Nullable;
 import org.eclipse.jgit.lib.Constants;
 import org.eclipse.jgit.lib.PersonIdent;

@@ -70,7 +73,7 @@ public final class RawParseUtils {
 	 *
 	 * @since 2.2
 	 */
 	public static final Charset UTF8_CHARSET = UTF_8;

 	private static final byte[] digits10;

@@ -81,9 +84,9 @@ public final class RawParseUtils {
 	private static final Map<String, Charset> encodingAliases;

 	static {
 		encodingAliases = new HashMap<String, Charset>();
 		encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$
 		encodingAliases.put("'utf8'", Charset.forName("UTF-8")); //$NON-NLS-1$ //$NON-NLS-2$
 		encodingAliases = new HashMap<>();
 		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
 		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$

 		digits10 = new byte['9' + 1];
 		Arrays.fill(digits10, (byte) -1);
@@ -671,6 +674,27 @@ public final class RawParseUtils {
 		return match(b, ptr, encoding);
 	}

 	/**
 	 * Extract the value of the "encoding " header as text.
 	 * <p>
 	 * The header is located with {@link #encoding(byte[], int)} and its
 	 * value is returned without being resolved to a character set.
 	 *
 	 * @param b
 	 *            buffer to scan.
 	 * @return the encoding header value as written in the buffer; null if
 	 *         the header is absent and an encoding should be assumed.
 	 * @since 4.2
 	 */
 	@Nullable
 	public static String parseEncodingName(final byte[] b) {
 		final int valueStart = encoding(b, 0);
 		if (valueStart >= 0) {
 			// The value ends one byte before the position nextLF reports.
 			final int valueEnd = nextLF(b, valueStart) - 1;
 			return decode(UTF_8, b, valueStart, valueEnd);
 		}
 		return null;
 	}

 	/**
 	 * Parse the "encoding " header into a character set reference.
 	 * <p>
@@ -678,29 +702,33 @@ public final class RawParseUtils {
 	 * {@link #encoding(byte[], int)} and then returns the proper character set
 	 * to apply to this buffer to evaluate its contents as character data.
 	 * <p>
 	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
 	 * If no encoding header is present {@code UTF-8} is assumed.
 	 *
 	 * @param b
 	 *            buffer to scan.
 	 * @return the Java character set representation. Never null.
 	 * @throws IllegalCharsetNameException
 	 *             if the character set requested by the encoding header is
 	 *             malformed and unsupportable.
 	 * @throws UnsupportedCharsetException
 	 *             if the JRE does not support the character set requested by
 	 *             the encoding header.
 	 */
 	public static Charset parseEncoding(final byte[] b) {
 		String enc = parseEncodingName(b);
 		if (enc == null) {
 			// No encoding header: UTF-8 is assumed.
 			return UTF_8;
 		}

 		String name = enc.trim();
 		try {
 			return Charset.forName(name);
 		} catch (IllegalCharsetNameException
 				| UnsupportedCharsetException badName) {
 			// Charset.forName rejected the name; consult the alias table
 			// before giving up and rethrowing the original failure.
 			Charset aliased = charsetForAlias(name);
 			if (aliased != null) {
 				return aliased;
 			}
 			throw badName;
 		}
 	}
@@ -739,7 +767,15 @@ public final class RawParseUtils {
 	 *         parsed.
 	 */
 	public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
 		final Charset cs = parseEncoding(raw);
 		Charset cs;
 		try {
 			cs = parseEncoding(raw);
 		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
 			// Assume UTF-8 for person identities, usually this is correct.
 			// If not decode() will fall back to the ISO-8859-1 encoding.
 			cs = UTF_8;
 		}

 		final int emailB = nextLF(raw, nameB, '<');
 		final int emailE = nextLF(raw, emailB, '>');
 		if (emailB >= raw.length || raw[emailB] == '\n' ||
@@ -887,7 +923,7 @@ public final class RawParseUtils {
 	 */
 	public static String decode(final byte[] buffer, final int start,
 			final int end) {
 		// Delegate to the charset-taking overload with UTF-8 as the
 		// suggested encoding.
 		return decode(UTF_8, buffer, start, end);
 	}

 	/**
@@ -961,23 +997,21 @@ public final class RawParseUtils {
 	public static String decodeNoFallback(final Charset cs,
 			final byte[] buffer, final int start, final int end)
 			throws CharacterCodingException {
 		final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 		b.mark();

 		// Try our built-in favorite. The assumption here is that
 		// decoding will fail if the data is not actually encoded
 		// using that encoder.
 		//
 		try {
 			return decode(b, Constants.CHARSET);
 			return decode(b, UTF_8);
 		} catch (CharacterCodingException e) {
 			b.reset();
 		}

 		if (!cs.equals(Constants.CHARSET)) {
 		if (!cs.equals(UTF_8)) {
 			// Try the suggested encoding, it might be right since it was
 			// provided by the caller.
 			//
 			try {
 				return decode(b, cs);
 			} catch (CharacterCodingException e) {
@@ -987,9 +1021,8 @@ public final class RawParseUtils {

 		// Try the default character set. A small group of people
 		// might actually use the same (or very similar) locale.
 		//
 		final Charset defcs = Charset.defaultCharset();
 		if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
 		Charset defcs = Charset.defaultCharset();
 		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
 			try {
 				return decode(b, defcs);
 			} catch (CharacterCodingException e) {