@@ -43,13 +43,18 @@ | |||
package org.eclipse.jgit.revwalk; | |||
import static java.nio.charset.StandardCharsets.UTF_8; | |||
import static org.junit.Assert.assertEquals; | |||
import static org.junit.Assert.assertNotNull; | |||
import static org.junit.Assert.assertNull; | |||
import static org.junit.Assert.assertSame; | |||
import static org.junit.Assert.assertTrue; | |||
import static org.junit.Assert.fail; | |||
import java.io.ByteArrayOutputStream; | |||
import java.io.UnsupportedEncodingException; | |||
import java.nio.charset.IllegalCharsetNameException; | |||
import java.nio.charset.UnsupportedCharsetException; | |||
import java.util.TimeZone; | |||
import org.eclipse.jgit.junit.RepositoryTestCase; | |||
@@ -303,6 +308,86 @@ public class RevCommitParseTest extends RepositoryTestCase { | |||
assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage()); | |||
} | |||
@Test | |||
public void testParse_incorrectUtf8Name() throws Exception { | |||
ByteArrayOutputStream b = new ByteArrayOutputStream(); | |||
b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n" | |||
.getBytes(UTF_8)); | |||
b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8)); | |||
b.write("committer co <c@example.com> 1218123390 -0500\n" | |||
.getBytes(UTF_8)); | |||
b.write("encoding 'utf8'\n".getBytes(UTF_8)); | |||
b.write("\n".getBytes(UTF_8)); | |||
b.write("Sm\u00f6rg\u00e5sbord\n".getBytes(UTF_8)); | |||
RevCommit c = new RevCommit( | |||
id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); | |||
c.parseCanonical(new RevWalk(db), b.toByteArray()); | |||
assertEquals("'utf8'", c.getEncodingName()); | |||
assertEquals("Sm\u00f6rg\u00e5sbord\n", c.getFullMessage()); | |||
try { | |||
c.getEncoding(); | |||
fail("Expected " + IllegalCharsetNameException.class); | |||
} catch (IllegalCharsetNameException badName) { | |||
assertEquals("'utf8'", badName.getMessage()); | |||
} | |||
} | |||
@Test | |||
public void testParse_illegalEncoding() throws Exception { | |||
ByteArrayOutputStream b = new ByteArrayOutputStream(); | |||
b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8)); | |||
b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8)); | |||
b.write("committer co <c@example.com> 1218123390 -0500\n".getBytes(UTF_8)); | |||
b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8)); | |||
b.write("\n".getBytes(UTF_8)); | |||
b.write("message\n".getBytes(UTF_8)); | |||
RevCommit c = new RevCommit( | |||
id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); | |||
c.parseCanonical(new RevWalk(db), b.toByteArray()); | |||
assertEquals("utf-8logoutputencoding=gbk", c.getEncodingName()); | |||
assertEquals("message\n", c.getFullMessage()); | |||
assertEquals("message", c.getShortMessage()); | |||
assertTrue(c.getFooterLines().isEmpty()); | |||
assertEquals("au", c.getAuthorIdent().getName()); | |||
try { | |||
c.getEncoding(); | |||
fail("Expected " + IllegalCharsetNameException.class); | |||
} catch (IllegalCharsetNameException badName) { | |||
assertEquals("utf-8logoutputencoding=gbk", badName.getMessage()); | |||
} | |||
} | |||
@Test | |||
public void testParse_unsupportedEncoding() throws Exception { | |||
ByteArrayOutputStream b = new ByteArrayOutputStream(); | |||
b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8)); | |||
b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8)); | |||
b.write("committer co <c@example.com> 1218123390 -0500\n".getBytes(UTF_8)); | |||
b.write("encoding it_IT.UTF8\n".getBytes(UTF_8)); | |||
b.write("\n".getBytes(UTF_8)); | |||
b.write("message\n".getBytes(UTF_8)); | |||
RevCommit c = new RevCommit( | |||
id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); | |||
c.parseCanonical(new RevWalk(db), b.toByteArray()); | |||
assertEquals("it_IT.UTF8", c.getEncodingName()); | |||
assertEquals("message\n", c.getFullMessage()); | |||
assertEquals("message", c.getShortMessage()); | |||
assertTrue(c.getFooterLines().isEmpty()); | |||
assertEquals("au", c.getAuthorIdent().getName()); | |||
try { | |||
c.getEncoding(); | |||
fail("Expected " + UnsupportedCharsetException.class); | |||
} catch (UnsupportedCharsetException badName) { | |||
assertEquals("it_IT.UTF8", badName.getMessage()); | |||
} | |||
} | |||
@Test | |||
public void testParse_NoMessage() throws Exception { | |||
final String msg = ""; |
@@ -43,6 +43,7 @@ | |||
package org.eclipse.jgit.revwalk; | |||
import static java.nio.charset.StandardCharsets.UTF_8; | |||
import static org.junit.Assert.assertEquals; | |||
import static org.junit.Assert.assertNotNull; | |||
import static org.junit.Assert.assertNull; | |||
@@ -361,6 +362,44 @@ public class RevTagParseTest extends RepositoryTestCase { | |||
assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage()); | |||
} | |||
@Test | |||
public void testParse_illegalEncoding() throws Exception { | |||
ByteArrayOutputStream b = new ByteArrayOutputStream(); | |||
b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8)); | |||
b.write("type tree\n".getBytes(UTF_8)); | |||
b.write("tag v1.0\n".getBytes(UTF_8)); | |||
b.write("tagger t <t@example.com> 1218123387 +0700\n".getBytes(UTF_8)); | |||
b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8)); | |||
b.write("\n".getBytes(UTF_8)); | |||
b.write("message\n".getBytes(UTF_8)); | |||
RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); | |||
t.parseCanonical(new RevWalk(db), b.toByteArray()); | |||
assertEquals("t", t.getTaggerIdent().getName()); | |||
assertEquals("message", t.getShortMessage()); | |||
assertEquals("message\n", t.getFullMessage()); | |||
} | |||
@Test | |||
public void testParse_unsupportedEncoding() throws Exception { | |||
ByteArrayOutputStream b = new ByteArrayOutputStream(); | |||
b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8)); | |||
b.write("type tree\n".getBytes(UTF_8)); | |||
b.write("tag v1.0\n".getBytes(UTF_8)); | |||
b.write("tagger t <t@example.com> 1218123387 +0700\n".getBytes(UTF_8)); | |||
b.write("encoding it_IT.UTF8\n".getBytes(UTF_8)); | |||
b.write("\n".getBytes(UTF_8)); | |||
b.write("message\n".getBytes(UTF_8)); | |||
RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); | |||
t.parseCanonical(new RevWalk(db), b.toByteArray()); | |||
assertEquals("t", t.getTaggerIdent().getName()); | |||
assertEquals("message", t.getShortMessage()); | |||
assertEquals("message\n", t.getFullMessage()); | |||
} | |||
@Test | |||
public void testParse_NoMessage() throws Exception { | |||
final String msg = ""; |
@@ -44,12 +44,17 @@ | |||
package org.eclipse.jgit.revwalk; | |||
import static java.nio.charset.StandardCharsets.UTF_8; | |||
import java.io.IOException; | |||
import java.nio.charset.Charset; | |||
import java.nio.charset.IllegalCharsetNameException; | |||
import java.nio.charset.UnsupportedCharsetException; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.List; | |||
import org.eclipse.jgit.annotations.Nullable; | |||
import org.eclipse.jgit.errors.IncorrectObjectTypeException; | |||
import org.eclipse.jgit.errors.MissingObjectException; | |||
import org.eclipse.jgit.lib.AnyObjectId; | |||
@@ -441,12 +446,12 @@ public class RevCommit extends RevObject { | |||
* @return decoded commit message as a string. Never null. | |||
*/ | |||
public final String getFullMessage() { | |||
final byte[] raw = buffer; | |||
final int msgB = RawParseUtils.commitMessage(raw, 0); | |||
if (msgB < 0) | |||
byte[] raw = buffer; | |||
int msgB = RawParseUtils.commitMessage(raw, 0); | |||
if (msgB < 0) { | |||
return ""; //$NON-NLS-1$ | |||
final Charset enc = RawParseUtils.parseEncoding(raw); | |||
return RawParseUtils.decode(enc, raw, msgB, raw.length); | |||
} | |||
return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length); | |||
} | |||
/** | |||
@@ -465,16 +470,17 @@ public class RevCommit extends RevObject { | |||
* spanned multiple lines. Embedded LFs are converted to spaces. | |||
*/ | |||
public final String getShortMessage() { | |||
final byte[] raw = buffer; | |||
final int msgB = RawParseUtils.commitMessage(raw, 0); | |||
if (msgB < 0) | |||
byte[] raw = buffer; | |||
int msgB = RawParseUtils.commitMessage(raw, 0); | |||
if (msgB < 0) { | |||
return ""; //$NON-NLS-1$ | |||
} | |||
final Charset enc = RawParseUtils.parseEncoding(raw); | |||
final int msgE = RawParseUtils.endOfParagraph(raw, msgB); | |||
String str = RawParseUtils.decode(enc, raw, msgB, msgE); | |||
if (hasLF(raw, msgB, msgE)) | |||
int msgE = RawParseUtils.endOfParagraph(raw, msgB); | |||
String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE); | |||
if (hasLF(raw, msgB, msgE)) { | |||
str = StringUtils.replaceLineBreaksWithSpace(str); | |||
} | |||
return str; | |||
} | |||
@@ -485,6 +491,23 @@ public class RevCommit extends RevObject { | |||
return false; | |||
} | |||
/** | |||
* Determine the encoding of the commit message buffer. | |||
* <p> | |||
* Locates the "encoding" header (if present) and returns its value. Due to | |||
* corruption in the wild this may be an invalid encoding name that is not | |||
* recognized by any character encoding library. | |||
* <p> | |||
* If no encoding header is present, null. | |||
* | |||
* @return the preferred encoding of {@link #getRawBuffer()}; or null. | |||
* @since 4.2 | |||
*/ | |||
@Nullable | |||
public final String getEncodingName() { | |||
return RawParseUtils.parseEncodingName(buffer); | |||
} | |||
/** | |||
* Determine the encoding of the commit message buffer. | |||
* <p> | |||
@@ -492,14 +515,28 @@ public class RevCommit extends RevObject { | |||
* character set to apply to this buffer to evaluate its contents as | |||
* character data. | |||
* <p> | |||
* If no encoding header is present, {@code UTF-8} is assumed.
* | |||
* @return the preferred encoding of {@link #getRawBuffer()}. | |||
* @throws IllegalCharsetNameException | |||
* if the character set requested by the encoding header is | |||
* malformed and unsupportable. | |||
* @throws UnsupportedCharsetException | |||
* if the JRE does not support the character set requested by | |||
* the encoding header. | |||
*/ | |||
public final Charset getEncoding() { | |||
return RawParseUtils.parseEncoding(buffer); | |||
} | |||
private Charset guessEncoding() { | |||
try { | |||
return getEncoding(); | |||
} catch (IllegalCharsetNameException | UnsupportedCharsetException e) { | |||
return UTF_8; | |||
} | |||
} | |||
/** | |||
* Parse the footer lines (e.g. "Signed-off-by") for machine processing. | |||
* <p> | |||
@@ -529,7 +566,7 @@ public class RevCommit extends RevObject { | |||
final int msgB = RawParseUtils.commitMessage(raw, 0); | |||
final ArrayList<FooterLine> r = new ArrayList<FooterLine>(4); | |||
final Charset enc = getEncoding(); | |||
final Charset enc = guessEncoding(); | |||
for (;;) { | |||
ptr = RawParseUtils.prevLF(raw, ptr); | |||
if (ptr <= msgB) |
@@ -45,8 +45,12 @@ | |||
package org.eclipse.jgit.revwalk; | |||
import static java.nio.charset.StandardCharsets.UTF_8; | |||
import java.io.IOException; | |||
import java.nio.charset.Charset; | |||
import java.nio.charset.IllegalCharsetNameException; | |||
import java.nio.charset.UnsupportedCharsetException; | |||
import org.eclipse.jgit.errors.CorruptObjectException; | |||
import org.eclipse.jgit.errors.IncorrectObjectTypeException; | |||
@@ -162,7 +166,7 @@ public class RevTag extends RevObject { | |||
int p = pos.value += 4; // "tag " | |||
final int nameEnd = RawParseUtils.nextLF(rawTag, p) - 1; | |||
tagName = RawParseUtils.decode(Constants.CHARSET, rawTag, p, nameEnd); | |||
tagName = RawParseUtils.decode(UTF_8, rawTag, p, nameEnd); | |||
if (walk.isRetainBody()) | |||
buffer = rawTag; | |||
@@ -207,12 +211,12 @@ public class RevTag extends RevObject { | |||
* @return decoded tag message as a string. Never null. | |||
*/ | |||
public final String getFullMessage() { | |||
final byte[] raw = buffer; | |||
final int msgB = RawParseUtils.tagMessage(raw, 0); | |||
if (msgB < 0) | |||
byte[] raw = buffer; | |||
int msgB = RawParseUtils.tagMessage(raw, 0); | |||
if (msgB < 0) { | |||
return ""; //$NON-NLS-1$ | |||
final Charset enc = RawParseUtils.parseEncoding(raw); | |||
return RawParseUtils.decode(enc, raw, msgB, raw.length); | |||
} | |||
return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length); | |||
} | |||
/** | |||
@@ -231,19 +235,28 @@ public class RevTag extends RevObject { | |||
* multiple lines. Embedded LFs are converted to spaces. | |||
*/ | |||
public final String getShortMessage() { | |||
final byte[] raw = buffer; | |||
final int msgB = RawParseUtils.tagMessage(raw, 0); | |||
if (msgB < 0) | |||
byte[] raw = buffer; | |||
int msgB = RawParseUtils.tagMessage(raw, 0); | |||
if (msgB < 0) { | |||
return ""; //$NON-NLS-1$ | |||
} | |||
final Charset enc = RawParseUtils.parseEncoding(raw); | |||
final int msgE = RawParseUtils.endOfParagraph(raw, msgB); | |||
String str = RawParseUtils.decode(enc, raw, msgB, msgE); | |||
if (RevCommit.hasLF(raw, msgB, msgE)) | |||
int msgE = RawParseUtils.endOfParagraph(raw, msgB); | |||
String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE); | |||
if (RevCommit.hasLF(raw, msgB, msgE)) { | |||
str = StringUtils.replaceLineBreaksWithSpace(str); | |||
} | |||
return str; | |||
} | |||
private Charset guessEncoding() { | |||
try { | |||
return RawParseUtils.parseEncoding(buffer); | |||
} catch (IllegalCharsetNameException | UnsupportedCharsetException e) { | |||
return UTF_8; | |||
} | |||
} | |||
/** | |||
* Get a reference to the object this tag was placed on. | |||
* <p> |
@@ -44,6 +44,8 @@ | |||
package org.eclipse.jgit.util; | |||
import static java.nio.charset.StandardCharsets.ISO_8859_1; | |||
import static java.nio.charset.StandardCharsets.UTF_8; | |||
import static org.eclipse.jgit.lib.ObjectChecker.author; | |||
import static org.eclipse.jgit.lib.ObjectChecker.committer; | |||
import static org.eclipse.jgit.lib.ObjectChecker.encoding; | |||
@@ -60,6 +62,7 @@ import java.util.Arrays; | |||
import java.util.HashMap; | |||
import java.util.Map; | |||
import org.eclipse.jgit.annotations.Nullable; | |||
import org.eclipse.jgit.lib.Constants; | |||
import org.eclipse.jgit.lib.PersonIdent; | |||
@@ -70,7 +73,7 @@ public final class RawParseUtils { | |||
* | |||
* @since 2.2 | |||
*/ | |||
public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$ | |||
public static final Charset UTF8_CHARSET = UTF_8; | |||
private static final byte[] digits10; | |||
@@ -81,9 +84,9 @@ public final class RawParseUtils { | |||
private static final Map<String, Charset> encodingAliases; | |||
static { | |||
encodingAliases = new HashMap<String, Charset>(); | |||
encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$ | |||
encodingAliases.put("'utf8'", Charset.forName("UTF-8")); //$NON-NLS-1$ //$NON-NLS-2$ | |||
encodingAliases = new HashMap<>(); | |||
encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ | |||
encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ | |||
digits10 = new byte['9' + 1]; | |||
Arrays.fill(digits10, (byte) -1); | |||
@@ -671,6 +674,27 @@ public final class RawParseUtils { | |||
return match(b, ptr, encoding); | |||
} | |||
/** | |||
* Parse the "encoding " header as a string. | |||
* <p> | |||
* Locates the "encoding " header (if present) and returns its value. | |||
* | |||
* @param b | |||
* buffer to scan. | |||
* @return the encoding header as specified in the commit; null if the | |||
* header was not present and should be assumed. | |||
* @since 4.2 | |||
*/ | |||
@Nullable | |||
public static String parseEncodingName(final byte[] b) { | |||
int enc = encoding(b, 0); | |||
if (enc < 0) { | |||
return null; | |||
} | |||
int lf = nextLF(b, enc); | |||
return decode(UTF_8, b, enc, lf - 1); | |||
} | |||
/** | |||
* Parse the "encoding " header into a character set reference. | |||
* <p> | |||
@@ -678,29 +702,33 @@ public final class RawParseUtils { | |||
* {@link #encoding(byte[], int)} and then returns the proper character set | |||
* to apply to this buffer to evaluate its contents as character data. | |||
* <p> | |||
* If no encoding header is present, {@code UTF-8} is assumed.
* | |||
* @param b | |||
* buffer to scan. | |||
* @return the Java character set representation. Never null. | |||
* @throws IllegalCharsetNameException | |||
* if the character set requested by the encoding header is | |||
* malformed and unsupportable. | |||
* @throws UnsupportedCharsetException | |||
* if the JRE does not support the character set requested by | |||
* the encoding header. | |||
*/ | |||
public static Charset parseEncoding(final byte[] b) { | |||
final int enc = encoding(b, 0); | |||
if (enc < 0) | |||
return Constants.CHARSET; | |||
final int lf = nextLF(b, enc); | |||
String decoded = decode(Constants.CHARSET, b, enc, lf - 1); | |||
String enc = parseEncodingName(b); | |||
if (enc == null) { | |||
return UTF_8; | |||
} | |||
String name = enc.trim(); | |||
try { | |||
return Charset.forName(decoded); | |||
} catch (IllegalCharsetNameException badName) { | |||
Charset aliased = charsetForAlias(decoded); | |||
if (aliased != null) | |||
return aliased; | |||
throw badName; | |||
} catch (UnsupportedCharsetException badName) { | |||
Charset aliased = charsetForAlias(decoded); | |||
if (aliased != null) | |||
return Charset.forName(name); | |||
} catch (IllegalCharsetNameException | |||
| UnsupportedCharsetException badName) { | |||
Charset aliased = charsetForAlias(name); | |||
if (aliased != null) { | |||
return aliased; | |||
} | |||
throw badName; | |||
} | |||
} | |||
@@ -739,7 +767,15 @@ public final class RawParseUtils { | |||
* parsed. | |||
*/ | |||
public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) { | |||
final Charset cs = parseEncoding(raw); | |||
Charset cs; | |||
try { | |||
cs = parseEncoding(raw); | |||
} catch (IllegalCharsetNameException | UnsupportedCharsetException e) { | |||
// Assume UTF-8 for person identities, usually this is correct. | |||
// If not decode() will fall back to the ISO-8859-1 encoding. | |||
cs = UTF_8; | |||
} | |||
final int emailB = nextLF(raw, nameB, '<'); | |||
final int emailE = nextLF(raw, emailB, '>'); | |||
if (emailB >= raw.length || raw[emailB] == '\n' || | |||
@@ -887,7 +923,7 @@ public final class RawParseUtils { | |||
*/ | |||
public static String decode(final byte[] buffer, final int start, | |||
final int end) { | |||
return decode(Constants.CHARSET, buffer, start, end); | |||
return decode(UTF_8, buffer, start, end); | |||
} | |||
/** | |||
@@ -961,23 +997,21 @@ public final class RawParseUtils { | |||
public static String decodeNoFallback(final Charset cs, | |||
final byte[] buffer, final int start, final int end) | |||
throws CharacterCodingException { | |||
final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); | |||
ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); | |||
b.mark(); | |||
// Try our built-in favorite. The assumption here is that | |||
// decoding will fail if the data is not actually encoded | |||
// using that encoder. | |||
// | |||
try { | |||
return decode(b, Constants.CHARSET); | |||
return decode(b, UTF_8); | |||
} catch (CharacterCodingException e) { | |||
b.reset(); | |||
} | |||
if (!cs.equals(Constants.CHARSET)) { | |||
if (!cs.equals(UTF_8)) { | |||
// Try the suggested encoding, it might be right since it was | |||
// provided by the caller. | |||
// | |||
try { | |||
return decode(b, cs); | |||
} catch (CharacterCodingException e) { | |||
@@ -987,9 +1021,8 @@ public final class RawParseUtils { | |||
// Try the default character set. A small group of people | |||
// might actually use the same (or very similar) locale. | |||
// | |||
final Charset defcs = Charset.defaultCharset(); | |||
if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) { | |||
Charset defcs = Charset.defaultCharset(); | |||
if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { | |||
try { | |||
return decode(b, defcs); | |||
} catch (CharacterCodingException e) { |