Browse Source

RevCommit: Better support invalid encoding headers

With this support we no longer need the "'utf8'" alias. UTF-8 is tried
automatically when the encoding header is not recognized, and is used
if the byte sequence decodes cleanly as UTF-8.

Modernize some of the references to use StandardCharsets.

Change-Id: I4c0c88750475560e1f2263180c4a98eb8febeca0
tags/v4.2.0.201601211800-r
Shawn Pearce 8 years ago
parent
commit
31d92ace5b

+ 85
- 0
org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java View File

@@ -43,13 +43,18 @@

package org.eclipse.jgit.revwalk;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.TimeZone;

import org.eclipse.jgit.junit.RepositoryTestCase;
@@ -303,6 +308,86 @@ public class RevCommitParseTest extends RepositoryTestCase {
assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
}

@Test
public void testParse_incorrectUtf8Name() throws Exception {
ByteArrayOutputStream b = new ByteArrayOutputStream();
b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n"
.getBytes(UTF_8));
b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
b.write("committer co <c@example.com> 1218123390 -0500\n"
.getBytes(UTF_8));
b.write("encoding 'utf8'\n".getBytes(UTF_8));
b.write("\n".getBytes(UTF_8));
b.write("Sm\u00f6rg\u00e5sbord\n".getBytes(UTF_8));

RevCommit c = new RevCommit(
id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
c.parseCanonical(new RevWalk(db), b.toByteArray());
assertEquals("'utf8'", c.getEncodingName());
assertEquals("Sm\u00f6rg\u00e5sbord\n", c.getFullMessage());

try {
c.getEncoding();
fail("Expected " + IllegalCharsetNameException.class);
} catch (IllegalCharsetNameException badName) {
assertEquals("'utf8'", badName.getMessage());
}
}

@Test
public void testParse_illegalEncoding() throws Exception {
ByteArrayOutputStream b = new ByteArrayOutputStream();
b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
b.write("committer co <c@example.com> 1218123390 -0500\n".getBytes(UTF_8));
b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8));
b.write("\n".getBytes(UTF_8));
b.write("message\n".getBytes(UTF_8));

RevCommit c = new RevCommit(
id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
c.parseCanonical(new RevWalk(db), b.toByteArray());
assertEquals("utf-8logoutputencoding=gbk", c.getEncodingName());
assertEquals("message\n", c.getFullMessage());
assertEquals("message", c.getShortMessage());
assertTrue(c.getFooterLines().isEmpty());
assertEquals("au", c.getAuthorIdent().getName());

try {
c.getEncoding();
fail("Expected " + IllegalCharsetNameException.class);
} catch (IllegalCharsetNameException badName) {
assertEquals("utf-8logoutputencoding=gbk", badName.getMessage());
}
}

@Test
public void testParse_unsupportedEncoding() throws Exception {
ByteArrayOutputStream b = new ByteArrayOutputStream();
b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
b.write("committer co <c@example.com> 1218123390 -0500\n".getBytes(UTF_8));
b.write("encoding it_IT.UTF8\n".getBytes(UTF_8));
b.write("\n".getBytes(UTF_8));
b.write("message\n".getBytes(UTF_8));

RevCommit c = new RevCommit(
id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
c.parseCanonical(new RevWalk(db), b.toByteArray());
assertEquals("it_IT.UTF8", c.getEncodingName());
assertEquals("message\n", c.getFullMessage());
assertEquals("message", c.getShortMessage());
assertTrue(c.getFooterLines().isEmpty());
assertEquals("au", c.getAuthorIdent().getName());

try {
c.getEncoding();
fail("Expected " + UnsupportedCharsetException.class);
} catch (UnsupportedCharsetException badName) {
assertEquals("it_IT.UTF8", badName.getMessage());
}
}

@Test
public void testParse_NoMessage() throws Exception {
final String msg = "";

+ 39
- 0
org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java View File

@@ -43,6 +43,7 @@

package org.eclipse.jgit.revwalk;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
@@ -361,6 +362,44 @@ public class RevTagParseTest extends RepositoryTestCase {
assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
}

@Test
public void testParse_illegalEncoding() throws Exception {
ByteArrayOutputStream b = new ByteArrayOutputStream();
b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
b.write("type tree\n".getBytes(UTF_8));
b.write("tag v1.0\n".getBytes(UTF_8));
b.write("tagger t <t@example.com> 1218123387 +0700\n".getBytes(UTF_8));
b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8));
b.write("\n".getBytes(UTF_8));
b.write("message\n".getBytes(UTF_8));

RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
t.parseCanonical(new RevWalk(db), b.toByteArray());

assertEquals("t", t.getTaggerIdent().getName());
assertEquals("message", t.getShortMessage());
assertEquals("message\n", t.getFullMessage());
}

@Test
public void testParse_unsupportedEncoding() throws Exception {
ByteArrayOutputStream b = new ByteArrayOutputStream();
b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
b.write("type tree\n".getBytes(UTF_8));
b.write("tag v1.0\n".getBytes(UTF_8));
b.write("tagger t <t@example.com> 1218123387 +0700\n".getBytes(UTF_8));
b.write("encoding it_IT.UTF8\n".getBytes(UTF_8));
b.write("\n".getBytes(UTF_8));
b.write("message\n".getBytes(UTF_8));

RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
t.parseCanonical(new RevWalk(db), b.toByteArray());

assertEquals("t", t.getTaggerIdent().getName());
assertEquals("message", t.getShortMessage());
assertEquals("message\n", t.getFullMessage());
}

@Test
public void testParse_NoMessage() throws Exception {
final String msg = "";

+ 51
- 14
org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java View File

@@ -44,12 +44,17 @@

package org.eclipse.jgit.revwalk;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.eclipse.jgit.annotations.Nullable;
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.AnyObjectId;
@@ -441,12 +446,12 @@ public class RevCommit extends RevObject {
* @return decoded commit message as a string. Never null.
*/
public final String getFullMessage() {
final byte[] raw = buffer;
final int msgB = RawParseUtils.commitMessage(raw, 0);
if (msgB < 0)
byte[] raw = buffer;
int msgB = RawParseUtils.commitMessage(raw, 0);
if (msgB < 0) {
return ""; //$NON-NLS-1$
final Charset enc = RawParseUtils.parseEncoding(raw);
return RawParseUtils.decode(enc, raw, msgB, raw.length);
}
return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
}

/**
@@ -465,16 +470,17 @@ public class RevCommit extends RevObject {
* spanned multiple lines. Embedded LFs are converted to spaces.
*/
public final String getShortMessage() {
final byte[] raw = buffer;
final int msgB = RawParseUtils.commitMessage(raw, 0);
if (msgB < 0)
byte[] raw = buffer;
int msgB = RawParseUtils.commitMessage(raw, 0);
if (msgB < 0) {
return ""; //$NON-NLS-1$
}

final Charset enc = RawParseUtils.parseEncoding(raw);
final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
String str = RawParseUtils.decode(enc, raw, msgB, msgE);
if (hasLF(raw, msgB, msgE))
int msgE = RawParseUtils.endOfParagraph(raw, msgB);
String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
if (hasLF(raw, msgB, msgE)) {
str = StringUtils.replaceLineBreaksWithSpace(str);
}
return str;
}

@@ -485,6 +491,23 @@ public class RevCommit extends RevObject {
return false;
}

/**
* Determine the encoding of the commit message buffer.
* <p>
* Locates the "encoding" header (if present) and returns its value. Due to
* corruption in the wild this may be an invalid encoding name that is not
* recognized by any character encoding library.
* <p>
* If no encoding header is present, null.
*
* @return the preferred encoding of {@link #getRawBuffer()}; or null.
* @since 4.2
*/
@Nullable
public final String getEncodingName() {
return RawParseUtils.parseEncodingName(buffer);
}

/**
* Determine the encoding of the commit message buffer.
* <p>
@@ -492,14 +515,28 @@ public class RevCommit extends RevObject {
* character set to apply to this buffer to evaluate its contents as
* character data.
* <p>
* If no encoding header is present, {@link Constants#CHARSET} is assumed.
* If no encoding header is present {@code UTF-8} is assumed.
*
* @return the preferred encoding of {@link #getRawBuffer()}.
* @throws IllegalCharsetNameException
* if the character set requested by the encoding header is
* malformed and unsupportable.
* @throws UnsupportedCharsetException
* if the JRE does not support the character set requested by
* the encoding header.
*/
public final Charset getEncoding() {
return RawParseUtils.parseEncoding(buffer);
}

private Charset guessEncoding() {
try {
return getEncoding();
} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
return UTF_8;
}
}

/**
* Parse the footer lines (e.g. "Signed-off-by") for machine processing.
* <p>
@@ -529,7 +566,7 @@ public class RevCommit extends RevObject {

final int msgB = RawParseUtils.commitMessage(raw, 0);
final ArrayList<FooterLine> r = new ArrayList<FooterLine>(4);
final Charset enc = getEncoding();
final Charset enc = guessEncoding();
for (;;) {
ptr = RawParseUtils.prevLF(raw, ptr);
if (ptr <= msgB)

+ 26
- 13
org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java View File

@@ -45,8 +45,12 @@

package org.eclipse.jgit.revwalk;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;

import org.eclipse.jgit.errors.CorruptObjectException;
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
@@ -162,7 +166,7 @@ public class RevTag extends RevObject {

int p = pos.value += 4; // "tag "
final int nameEnd = RawParseUtils.nextLF(rawTag, p) - 1;
tagName = RawParseUtils.decode(Constants.CHARSET, rawTag, p, nameEnd);
tagName = RawParseUtils.decode(UTF_8, rawTag, p, nameEnd);

if (walk.isRetainBody())
buffer = rawTag;
@@ -207,12 +211,12 @@ public class RevTag extends RevObject {
* @return decoded tag message as a string. Never null.
*/
public final String getFullMessage() {
final byte[] raw = buffer;
final int msgB = RawParseUtils.tagMessage(raw, 0);
if (msgB < 0)
byte[] raw = buffer;
int msgB = RawParseUtils.tagMessage(raw, 0);
if (msgB < 0) {
return ""; //$NON-NLS-1$
final Charset enc = RawParseUtils.parseEncoding(raw);
return RawParseUtils.decode(enc, raw, msgB, raw.length);
}
return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
}

/**
@@ -231,19 +235,28 @@ public class RevTag extends RevObject {
* multiple lines. Embedded LFs are converted to spaces.
*/
public final String getShortMessage() {
final byte[] raw = buffer;
final int msgB = RawParseUtils.tagMessage(raw, 0);
if (msgB < 0)
byte[] raw = buffer;
int msgB = RawParseUtils.tagMessage(raw, 0);
if (msgB < 0) {
return ""; //$NON-NLS-1$
}

final Charset enc = RawParseUtils.parseEncoding(raw);
final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
String str = RawParseUtils.decode(enc, raw, msgB, msgE);
if (RevCommit.hasLF(raw, msgB, msgE))
int msgE = RawParseUtils.endOfParagraph(raw, msgB);
String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
if (RevCommit.hasLF(raw, msgB, msgE)) {
str = StringUtils.replaceLineBreaksWithSpace(str);
}
return str;
}

private Charset guessEncoding() {
try {
return RawParseUtils.parseEncoding(buffer);
} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
return UTF_8;
}
}

/**
* Get a reference to the object this tag was placed on.
* <p>

+ 62
- 29
org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java View File

@@ -44,6 +44,8 @@

package org.eclipse.jgit.util;

import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.eclipse.jgit.lib.ObjectChecker.author;
import static org.eclipse.jgit.lib.ObjectChecker.committer;
import static org.eclipse.jgit.lib.ObjectChecker.encoding;
@@ -60,6 +62,7 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.eclipse.jgit.annotations.Nullable;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.PersonIdent;

@@ -70,7 +73,7 @@ public final class RawParseUtils {
*
* @since 2.2
*/
public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$
public static final Charset UTF8_CHARSET = UTF_8;

private static final byte[] digits10;

@@ -81,9 +84,9 @@ public final class RawParseUtils {
private static final Map<String, Charset> encodingAliases;

static {
encodingAliases = new HashMap<String, Charset>();
encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$
encodingAliases.put("'utf8'", Charset.forName("UTF-8")); //$NON-NLS-1$ //$NON-NLS-2$
encodingAliases = new HashMap<>();
encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$

digits10 = new byte['9' + 1];
Arrays.fill(digits10, (byte) -1);
@@ -671,6 +674,27 @@ public final class RawParseUtils {
return match(b, ptr, encoding);
}

/**
* Parse the "encoding " header as a string.
* <p>
* Locates the "encoding " header (if present) and returns its value.
*
* @param b
* buffer to scan.
* @return the encoding header as specified in the commit; null if the
* header was not present and should be assumed.
* @since 4.2
*/
@Nullable
public static String parseEncodingName(final byte[] b) {
int enc = encoding(b, 0);
if (enc < 0) {
return null;
}
int lf = nextLF(b, enc);
return decode(UTF_8, b, enc, lf - 1);
}

/**
* Parse the "encoding " header into a character set reference.
* <p>
@@ -678,29 +702,33 @@ public final class RawParseUtils {
* {@link #encoding(byte[], int)} and then returns the proper character set
* to apply to this buffer to evaluate its contents as character data.
* <p>
* If no encoding header is present, {@link Constants#CHARSET} is assumed.
* If no encoding header is present {@code UTF-8} is assumed.
*
* @param b
* buffer to scan.
* @return the Java character set representation. Never null.
* @throws IllegalCharsetNameException
* if the character set requested by the encoding header is
* malformed and unsupportable.
* @throws UnsupportedCharsetException
* if the JRE does not support the character set requested by
* the encoding header.
*/
public static Charset parseEncoding(final byte[] b) {
final int enc = encoding(b, 0);
if (enc < 0)
return Constants.CHARSET;
final int lf = nextLF(b, enc);
String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
String enc = parseEncodingName(b);
if (enc == null) {
return UTF_8;
}

String name = enc.trim();
try {
return Charset.forName(decoded);
} catch (IllegalCharsetNameException badName) {
Charset aliased = charsetForAlias(decoded);
if (aliased != null)
return aliased;
throw badName;
} catch (UnsupportedCharsetException badName) {
Charset aliased = charsetForAlias(decoded);
if (aliased != null)
return Charset.forName(name);
} catch (IllegalCharsetNameException
| UnsupportedCharsetException badName) {
Charset aliased = charsetForAlias(name);
if (aliased != null) {
return aliased;
}
throw badName;
}
}
@@ -739,7 +767,15 @@ public final class RawParseUtils {
* parsed.
*/
public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
final Charset cs = parseEncoding(raw);
Charset cs;
try {
cs = parseEncoding(raw);
} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
// Assume UTF-8 for person identities, usually this is correct.
// If not decode() will fall back to the ISO-8859-1 encoding.
cs = UTF_8;
}

final int emailB = nextLF(raw, nameB, '<');
final int emailE = nextLF(raw, emailB, '>');
if (emailB >= raw.length || raw[emailB] == '\n' ||
@@ -887,7 +923,7 @@ public final class RawParseUtils {
*/
public static String decode(final byte[] buffer, final int start,
final int end) {
return decode(Constants.CHARSET, buffer, start, end);
return decode(UTF_8, buffer, start, end);
}

/**
@@ -961,23 +997,21 @@ public final class RawParseUtils {
public static String decodeNoFallback(final Charset cs,
final byte[] buffer, final int start, final int end)
throws CharacterCodingException {
final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
b.mark();

// Try our built-in favorite. The assumption here is that
// decoding will fail if the data is not actually encoded
// using that encoder.
//
try {
return decode(b, Constants.CHARSET);
return decode(b, UTF_8);
} catch (CharacterCodingException e) {
b.reset();
}

if (!cs.equals(Constants.CHARSET)) {
if (!cs.equals(UTF_8)) {
// Try the suggested encoding, it might be right since it was
// provided by the caller.
//
try {
return decode(b, cs);
} catch (CharacterCodingException e) {
@@ -987,9 +1021,8 @@ public final class RawParseUtils {

// Try the default character set. A small group of people
// might actually use the same (or very similar) locale.
//
final Charset defcs = Charset.defaultCharset();
if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
Charset defcs = Charset.defaultCharset();
if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
try {
return decode(b, defcs);
} catch (CharacterCodingException e) {

Loading…
Cancel
Save