diff options
author | Shawn Pearce <spearce@spearce.org> | 2016-01-11 12:30:35 -0800 |
---|---|---|
committer | Shawn Pearce <spearce@spearce.org> | 2016-01-11 12:30:35 -0800 |
commit | 31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6 (patch) | |
tree | fd3ae6103257768062d2f0f6a31018b3d8f0122f /org.eclipse.jgit | |
parent | 24cd8e170d377cef87166d43ca25bfa2340755c6 (diff) | |
download | jgit-31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6.tar.gz jgit-31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6.zip |
RevCommit: Better support invalid encoding headers
With this support we no longer need the 'utf8' alias. UTF-8 will be
automatically tried when the encoding header is not recognized and used
if the character sequence cleanly decodes as UTF-8.
Modernize some of the references to use StandardCharsets.
Change-Id: I4c0c88750475560e1f2263180c4a98eb8febeca0
Diffstat (limited to 'org.eclipse.jgit')
3 files changed, 139 insertions, 56 deletions
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java index c23e4e3288..e67ada6022 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java @@ -44,12 +44,17 @@ package org.eclipse.jgit.revwalk; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.IOException; import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import org.eclipse.jgit.annotations.Nullable; import org.eclipse.jgit.errors.IncorrectObjectTypeException; import org.eclipse.jgit.errors.MissingObjectException; import org.eclipse.jgit.lib.AnyObjectId; @@ -441,12 +446,12 @@ public class RevCommit extends RevObject { * @return decoded commit message as a string. Never null. */ public final String getFullMessage() { - final byte[] raw = buffer; - final int msgB = RawParseUtils.commitMessage(raw, 0); - if (msgB < 0) + byte[] raw = buffer; + int msgB = RawParseUtils.commitMessage(raw, 0); + if (msgB < 0) { return ""; //$NON-NLS-1$ - final Charset enc = RawParseUtils.parseEncoding(raw); - return RawParseUtils.decode(enc, raw, msgB, raw.length); + } + return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length); } /** @@ -465,16 +470,17 @@ public class RevCommit extends RevObject { * spanned multiple lines. Embedded LFs are converted to spaces. 
*/ public final String getShortMessage() { - final byte[] raw = buffer; - final int msgB = RawParseUtils.commitMessage(raw, 0); - if (msgB < 0) + byte[] raw = buffer; + int msgB = RawParseUtils.commitMessage(raw, 0); + if (msgB < 0) { return ""; //$NON-NLS-1$ + } - final Charset enc = RawParseUtils.parseEncoding(raw); - final int msgE = RawParseUtils.endOfParagraph(raw, msgB); - String str = RawParseUtils.decode(enc, raw, msgB, msgE); - if (hasLF(raw, msgB, msgE)) + int msgE = RawParseUtils.endOfParagraph(raw, msgB); + String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE); + if (hasLF(raw, msgB, msgE)) { str = StringUtils.replaceLineBreaksWithSpace(str); + } return str; } @@ -488,18 +494,49 @@ public class RevCommit extends RevObject { /** * Determine the encoding of the commit message buffer. * <p> + * Locates the "encoding" header (if present) and returns its value. Due to + * corruption in the wild this may be an invalid encoding name that is not + * recognized by any character encoding library. + * <p> + * If no encoding header is present, null. + * + * @return the preferred encoding of {@link #getRawBuffer()}; or null. + * @since 4.2 + */ + @Nullable + public final String getEncodingName() { + return RawParseUtils.parseEncodingName(buffer); + } + + /** + * Determine the encoding of the commit message buffer. + * <p> * Locates the "encoding" header (if present) and then returns the proper * character set to apply to this buffer to evaluate its contents as * character data. * <p> - * If no encoding header is present, {@link Constants#CHARSET} is assumed. + * If no encoding header is present {@code UTF-8} is assumed. * * @return the preferred encoding of {@link #getRawBuffer()}. + * @throws IllegalCharsetNameException + * if the character set requested by the encoding header is + * malformed and unsupportable. + * @throws UnsupportedCharsetException + * if the JRE does not support the character set requested by + * the encoding header. 
*/ public final Charset getEncoding() { return RawParseUtils.parseEncoding(buffer); } + private Charset guessEncoding() { + try { + return getEncoding(); + } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { + return UTF_8; + } + } + /** * Parse the footer lines (e.g. "Signed-off-by") for machine processing. * <p> @@ -529,7 +566,7 @@ public class RevCommit extends RevObject { final int msgB = RawParseUtils.commitMessage(raw, 0); final ArrayList<FooterLine> r = new ArrayList<FooterLine>(4); - final Charset enc = getEncoding(); + final Charset enc = guessEncoding(); for (;;) { ptr = RawParseUtils.prevLF(raw, ptr); if (ptr <= msgB) diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java index bf2785e0d7..81a54bf7ea 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java @@ -45,8 +45,12 @@ package org.eclipse.jgit.revwalk; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.IOException; import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; import org.eclipse.jgit.errors.CorruptObjectException; import org.eclipse.jgit.errors.IncorrectObjectTypeException; @@ -162,7 +166,7 @@ public class RevTag extends RevObject { int p = pos.value += 4; // "tag " final int nameEnd = RawParseUtils.nextLF(rawTag, p) - 1; - tagName = RawParseUtils.decode(Constants.CHARSET, rawTag, p, nameEnd); + tagName = RawParseUtils.decode(UTF_8, rawTag, p, nameEnd); if (walk.isRetainBody()) buffer = rawTag; @@ -207,12 +211,12 @@ public class RevTag extends RevObject { * @return decoded tag message as a string. Never null. 
*/ public final String getFullMessage() { - final byte[] raw = buffer; - final int msgB = RawParseUtils.tagMessage(raw, 0); - if (msgB < 0) + byte[] raw = buffer; + int msgB = RawParseUtils.tagMessage(raw, 0); + if (msgB < 0) { return ""; //$NON-NLS-1$ - final Charset enc = RawParseUtils.parseEncoding(raw); - return RawParseUtils.decode(enc, raw, msgB, raw.length); + } + return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length); } /** @@ -231,19 +235,28 @@ public class RevTag extends RevObject { * multiple lines. Embedded LFs are converted to spaces. */ public final String getShortMessage() { - final byte[] raw = buffer; - final int msgB = RawParseUtils.tagMessage(raw, 0); - if (msgB < 0) + byte[] raw = buffer; + int msgB = RawParseUtils.tagMessage(raw, 0); + if (msgB < 0) { return ""; //$NON-NLS-1$ + } - final Charset enc = RawParseUtils.parseEncoding(raw); - final int msgE = RawParseUtils.endOfParagraph(raw, msgB); - String str = RawParseUtils.decode(enc, raw, msgB, msgE); - if (RevCommit.hasLF(raw, msgB, msgE)) + int msgE = RawParseUtils.endOfParagraph(raw, msgB); + String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE); + if (RevCommit.hasLF(raw, msgB, msgE)) { str = StringUtils.replaceLineBreaksWithSpace(str); + } return str; } + private Charset guessEncoding() { + try { + return RawParseUtils.parseEncoding(buffer); + } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { + return UTF_8; + } + } + /** * Get a reference to the object this tag was placed on. 
* <p> diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java index a20e0b0603..f2955f7e6b 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java @@ -44,6 +44,8 @@ package org.eclipse.jgit.util; +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.eclipse.jgit.lib.ObjectChecker.author; import static org.eclipse.jgit.lib.ObjectChecker.committer; import static org.eclipse.jgit.lib.ObjectChecker.encoding; @@ -60,6 +62,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; +import org.eclipse.jgit.annotations.Nullable; import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.PersonIdent; @@ -70,7 +73,7 @@ public final class RawParseUtils { * * @since 2.2 */ - public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$ + public static final Charset UTF8_CHARSET = UTF_8; private static final byte[] digits10; @@ -81,9 +84,9 @@ public final class RawParseUtils { private static final Map<String, Charset> encodingAliases; static { - encodingAliases = new HashMap<String, Charset>(); - encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$ - encodingAliases.put("'utf8'", Charset.forName("UTF-8")); //$NON-NLS-1$ //$NON-NLS-2$ + encodingAliases = new HashMap<>(); + encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ + encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ digits10 = new byte['9' + 1]; Arrays.fill(digits10, (byte) -1); @@ -672,35 +675,60 @@ public final class RawParseUtils { } /** + * Parse the "encoding " header as a string. + * <p> + * Locates the "encoding " header (if present) and returns its value. + * + * @param b + * buffer to scan. 
+ * @return the encoding header as specified in the commit; null if the + * header was not present and should be assumed. + * @since 4.2 + */ + @Nullable + public static String parseEncodingName(final byte[] b) { + int enc = encoding(b, 0); + if (enc < 0) { + return null; + } + int lf = nextLF(b, enc); + return decode(UTF_8, b, enc, lf - 1); + } + + /** * Parse the "encoding " header into a character set reference. * <p> * Locates the "encoding " header (if present) by first calling * {@link #encoding(byte[], int)} and then returns the proper character set * to apply to this buffer to evaluate its contents as character data. * <p> - * If no encoding header is present, {@link Constants#CHARSET} is assumed. + * If no encoding header is present {@code UTF-8} is assumed. * * @param b * buffer to scan. * @return the Java character set representation. Never null. + * @throws IllegalCharsetNameException + * if the character set requested by the encoding header is + * malformed and unsupportable. + * @throws UnsupportedCharsetException + * if the JRE does not support the character set requested by + * the encoding header. 
*/ public static Charset parseEncoding(final byte[] b) { - final int enc = encoding(b, 0); - if (enc < 0) - return Constants.CHARSET; - final int lf = nextLF(b, enc); - String decoded = decode(Constants.CHARSET, b, enc, lf - 1); + String enc = parseEncodingName(b); + if (enc == null) { + return UTF_8; + } + + String name = enc.trim(); try { - return Charset.forName(decoded); - } catch (IllegalCharsetNameException badName) { - Charset aliased = charsetForAlias(decoded); - if (aliased != null) - return aliased; - throw badName; - } catch (UnsupportedCharsetException badName) { - Charset aliased = charsetForAlias(decoded); - if (aliased != null) + return Charset.forName(name); + } catch (IllegalCharsetNameException + | UnsupportedCharsetException badName) { + Charset aliased = charsetForAlias(name); + if (aliased != null) { return aliased; + } throw badName; } } @@ -739,7 +767,15 @@ public final class RawParseUtils { * parsed. */ public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) { - final Charset cs = parseEncoding(raw); + Charset cs; + try { + cs = parseEncoding(raw); + } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { + // Assume UTF-8 for person identities, usually this is correct. + // If not decode() will fall back to the ISO-8859-1 encoding. 
+ cs = UTF_8; + } + final int emailB = nextLF(raw, nameB, '<'); final int emailE = nextLF(raw, emailB, '>'); if (emailB >= raw.length || raw[emailB] == '\n' || @@ -887,7 +923,7 @@ public final class RawParseUtils { */ public static String decode(final byte[] buffer, final int start, final int end) { - return decode(Constants.CHARSET, buffer, start, end); + return decode(UTF_8, buffer, start, end); } /** @@ -961,23 +997,21 @@ public final class RawParseUtils { public static String decodeNoFallback(final Charset cs, final byte[] buffer, final int start, final int end) throws CharacterCodingException { - final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); + ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); b.mark(); // Try our built-in favorite. The assumption here is that // decoding will fail if the data is not actually encoded // using that encoder. - // try { - return decode(b, Constants.CHARSET); + return decode(b, UTF_8); } catch (CharacterCodingException e) { b.reset(); } - if (!cs.equals(Constants.CHARSET)) { + if (!cs.equals(UTF_8)) { // Try the suggested encoding, it might be right since it was // provided by the caller. - // try { return decode(b, cs); } catch (CharacterCodingException e) { @@ -987,9 +1021,8 @@ public final class RawParseUtils { // Try the default character set. A small group of people // might actually use the same (or very similar) locale. - // - final Charset defcs = Charset.defaultCharset(); - if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) { + Charset defcs = Charset.defaultCharset(); + if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { try { return decode(b, defcs); } catch (CharacterCodingException e) { |