From ae9e157ef4e6a3708489725d4436cc15d273308f Mon Sep 17 00:00:00 2001 From: James Moger Date: Thu, 7 Jun 2012 17:30:18 -0400 Subject: [PATCH] Try multiple encodings when working with string blobs (issue 97) --- distrib/gitblit.properties | 11 +++++ docs/04_releases.mkd | 2 + src/com/gitblit/GitBlit.java | 9 ++++ src/com/gitblit/LuceneExecutor.java | 3 +- src/com/gitblit/PagesServlet.java | 8 ++-- src/com/gitblit/utils/JGitUtils.java | 11 ++--- src/com/gitblit/utils/StringUtils.java | 42 +++++++++++++++++++ src/com/gitblit/wicket/pages/BlobPage.java | 11 ++--- .../gitblit/wicket/pages/MarkdownPage.java | 6 ++- src/com/gitblit/wicket/pages/RawPage.java | 9 ++-- src/com/gitblit/wicket/pages/SummaryPage.java | 3 +- tests/com/gitblit/tests/JGitUtilsTest.java | 3 +- 12 files changed, 96 insertions(+), 22 deletions(-) diff --git a/distrib/gitblit.properties b/distrib/gitblit.properties index 5292a91e..8e768262 100644 --- a/distrib/gitblit.properties +++ b/distrib/gitblit.properties @@ -362,6 +362,16 @@ web.loginMessage = gitblit # SINCE 0.5.0 web.repositoriesMessage = gitblit +# Ordered list of charsets/encodings to use when trying to display a blob. +# If empty, UTF-8 and ISO-8859-1 are used. The server's default charset +# is always appended to the encoding list. If all encodings fail to cleanly +# decode the blob content, UTF-8 will be used with the standard malformed +# input/unmappable character replacement strings. +# +# SPACE-DELIMITED +# SINCE 1.0.0 +web.blobEncodings = UTF-8 ISO-8859-1 + # Manually set the default timezone to be used by Gitblit for display in the # web ui. This value is independent of the JVM timezone. Specifying a blank # value will default to the JVM timezone. @@ -432,6 +442,7 @@ web.forwardSlashCharacter = / # e.g. 
# web.otherUrls = ssh://localhost/git/{0} git://localhost/git/{0} # +# SPACE-DELIMITED # SINCE 0.5.0 web.otherUrls = diff --git a/docs/04_releases.mkd b/docs/04_releases.mkd index d20000bf..00981978 100644 --- a/docs/04_releases.mkd +++ b/docs/04_releases.mkd @@ -16,6 +16,8 @@ #### additions +- Added setting to control charsets for blob string decoding. Default encodings are UTF-8, ISO-8859-1, and server's default charset. (issue 97) + **New:** *web.blobEncodings = UTF-8 ISO-8859-1* - Exposed JGit's internal configuration settings in gitblit.properties/web.xml (issue 93) **New:** *git.packedGitWindowSize = 8k* **New:** *git.packedGitLimit = 10m* diff --git a/src/com/gitblit/GitBlit.java b/src/com/gitblit/GitBlit.java index f96340ae..dc53540e 100644 --- a/src/com/gitblit/GitBlit.java +++ b/src/com/gitblit/GitBlit.java @@ -189,6 +189,15 @@ public class GitBlit implements ServletContextListener { return self().timezone; } + /** + * Returns the user-defined blob encodings. + * + * @return an array of encodings, may be empty + */ + public static String [] getEncodings() { + return getStrings(Keys.web.blobEncodings).toArray(new String[0]); + } + /** * Returns the boolean value for the specified key. 
If the key does not diff --git a/src/com/gitblit/LuceneExecutor.java b/src/com/gitblit/LuceneExecutor.java index afd1cc5a..b3165434 100644 --- a/src/com/gitblit/LuceneExecutor.java +++ b/src/com/gitblit/LuceneExecutor.java @@ -642,6 +642,7 @@ public class LuceneExecutor implements Runnable { String branch, RevCommit commit) { IndexResult result = new IndexResult(); try { + String [] encodings = storedSettings.getStrings(Keys.web.blobEncodings).toArray(new String[0]); List changedPaths = JGitUtils.getFilesInCommit(repository, commit); String revDate = DateTools.timeToString(commit.getCommitTime() * 1000L, Resolution.MINUTE); @@ -674,7 +675,7 @@ public class LuceneExecutor implements Runnable { if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) { // read the blob content String str = JGitUtils.getStringContent(repository, commit.getTree(), - path.path); + path.path, encodings); doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED)); writer.addDocument(doc); } diff --git a/src/com/gitblit/PagesServlet.java b/src/com/gitblit/PagesServlet.java index d6304f70..ad9276b4 100644 --- a/src/com/gitblit/PagesServlet.java +++ b/src/com/gitblit/PagesServlet.java @@ -141,13 +141,15 @@ public class PagesServlet extends HttpServlet { } response.setDateHeader("Last-Modified", JGitUtils.getCommitDate(commit).getTime()); + String [] encodings = GitBlit.getEncodings(); + RevTree tree = commit.getTree(); byte[] content = null; if (StringUtils.isEmpty(resource)) { // find resource String[] files = { "index.html", "index.htm", "index.mkd" }; for (String file : files) { - content = JGitUtils.getStringContent(r, tree, file) + content = JGitUtils.getStringContent(r, tree, file, encodings) .getBytes(Constants.ENCODING); if (content != null) { resource = file; @@ -165,7 +167,7 @@ public class PagesServlet extends HttpServlet { contentType = "text/plain"; } if (contentType.startsWith("text")) { - content = JGitUtils.getStringContent(r, tree, resource).getBytes( + 
content = JGitUtils.getStringContent(r, tree, resource, encodings).getBytes( Constants.ENCODING); } else { content = JGitUtils.getByteContent(r, tree, resource); @@ -177,7 +179,7 @@ public class PagesServlet extends HttpServlet { // no content, try custom 404 page if (ArrayUtils.isEmpty(content)) { - String custom404 = JGitUtils.getStringContent(r, tree, "404.html"); + String custom404 = JGitUtils.getStringContent(r, tree, "404.html", encodings); if (!StringUtils.isEmpty(custom404)) { content = custom404.getBytes(Constants.ENCODING); } diff --git a/src/com/gitblit/utils/JGitUtils.java b/src/com/gitblit/utils/JGitUtils.java index f5ca5efd..72a8ab3c 100644 --- a/src/com/gitblit/utils/JGitUtils.java +++ b/src/com/gitblit/utils/JGitUtils.java @@ -20,7 +20,6 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.nio.charset.Charset; import java.text.MessageFormat; import java.util.ArrayList; import java.util.Arrays; @@ -543,14 +542,15 @@ public class JGitUtils { * @param tree * if null, the RevTree from HEAD is assumed. * @param blobPath + * @param charsets optional * @return UTF-8 string content */ - public static String getStringContent(Repository repository, RevTree tree, String blobPath) { + public static String getStringContent(Repository repository, RevTree tree, String blobPath, String... charsets) { byte[] content = getByteContent(repository, tree, blobPath); if (content == null) { return null; } - return new String(content, Charset.forName(Constants.CHARACTER_ENCODING)); + return StringUtils.decodeString(content, charsets); } /** @@ -589,14 +589,15 @@ public class JGitUtils { * * @param repository * @param objectId + * @param charsets optional * @return UTF-8 string content */ - public static String getStringContent(Repository repository, String objectId) { + public static String getStringContent(Repository repository, String objectId, String... 
charsets) { byte[] content = getByteContent(repository, objectId); if (content == null) { return null; } - return new String(content, Charset.forName(Constants.CHARACTER_ENCODING)); + return StringUtils.decodeString(content, charsets); } /** diff --git a/src/com/gitblit/utils/StringUtils.java b/src/com/gitblit/utils/StringUtils.java index 2c357241..baed5f0c 100644 --- a/src/com/gitblit/utils/StringUtils.java +++ b/src/com/gitblit/utils/StringUtils.java @@ -16,13 +16,23 @@ package com.gitblit.utils; import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Set; import java.util.regex.PatternSyntaxException; /** @@ -550,4 +560,36 @@ public class StringUtils { // remember to append any characters to the right of a match return sb.toString(); } + + /** + * Decodes a string by trying several charsets until one does not throw a + * coding exception. Last resort is to interpret as UTF-8 with illegal + * character substitution. + * + * @param content + * @param charsets optional + * @return a string + */ + public static String decodeString(byte [] content, String... 
charsets) { + Set<String> sets = new LinkedHashSet<String>(); + if (!ArrayUtils.isEmpty(charsets)) { + sets.addAll(Arrays.asList(charsets)); + } + sets.addAll(Arrays.asList("UTF-8", "ISO-8859-1", Charset.defaultCharset().name())); + for (String charset : sets) { + try { + Charset cs = Charset.forName(charset); + CharsetDecoder decoder = cs.newDecoder(); + CharBuffer buffer = decoder.decode(ByteBuffer.wrap(content)); + return buffer.toString(); + } catch (CharacterCodingException e) { + // ignore and advance to the next charset + } catch (IllegalCharsetNameException e) { + // ignore illegal charset names + } catch (UnsupportedCharsetException e) { + // ignore unsupported charsets + } + } + return new String(content, Charset.forName("UTF-8")); + } } \ No newline at end of file diff --git a/src/com/gitblit/wicket/pages/BlobPage.java b/src/com/gitblit/wicket/pages/BlobPage.java index 1c438370..fb5a962b 100644 --- a/src/com/gitblit/wicket/pages/BlobPage.java +++ b/src/com/gitblit/wicket/pages/BlobPage.java @@ -41,7 +41,8 @@ public class BlobPage extends RepositoryPage { Repository r = getRepository(); final String blobPath = WicketUtils.getPath(params); - + String [] encodings = GitBlit.getEncodings(); + if (StringUtils.isEmpty(blobPath)) { // blob by objectid @@ -54,7 +55,7 @@ public class BlobPage extends RepositoryPage { add(new BookmarkablePageLink("headLink", BlobPage.class).setEnabled(false)); add(new CommitHeaderPanel("commitHeader", objectId)); add(new PathBreadcrumbsPanel("breadcrumbs", repositoryName, blobPath, objectId)); - Component c = new Label("blobText", JGitUtils.getStringContent(r, objectId)); + Component c = new Label("blobText", JGitUtils.getStringContent(r, objectId, encodings)); WicketUtils.setCssClass(c, "plainprint"); add(c); } else { @@ -111,7 +112,7 @@ public class BlobPage extends RepositoryPage { case 1: // PrettyPrint blob text c = new Label("blobText", JGitUtils.getStringContent(r, commit.getTree(), - blobPath)); + blobPath, encodings));
WicketUtils.setCssClass(c, "prettyprint linenums"); break; case 2: @@ -125,14 +126,14 @@ public class BlobPage extends RepositoryPage { default: // plain text c = new Label("blobText", JGitUtils.getStringContent(r, commit.getTree(), - blobPath)); + blobPath, encodings)); WicketUtils.setCssClass(c, "plainprint"); } add(c); } else { // plain text Label blobLabel = new Label("blobText", JGitUtils.getStringContent(r, - commit.getTree(), blobPath)); + commit.getTree(), blobPath, encodings)); WicketUtils.setCssClass(blobLabel, "plainprint"); add(blobLabel); } diff --git a/src/com/gitblit/wicket/pages/MarkdownPage.java b/src/com/gitblit/wicket/pages/MarkdownPage.java index aaf12bad..5764235a 100644 --- a/src/com/gitblit/wicket/pages/MarkdownPage.java +++ b/src/com/gitblit/wicket/pages/MarkdownPage.java @@ -24,6 +24,7 @@ import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.Repository; import org.eclipse.jgit.revwalk.RevCommit; +import com.gitblit.GitBlit; import com.gitblit.utils.JGitUtils; import com.gitblit.utils.MarkdownUtils; import com.gitblit.wicket.WicketUtils; @@ -37,7 +38,8 @@ public class MarkdownPage extends RepositoryPage { Repository r = getRepository(); RevCommit commit = JGitUtils.getCommit(r, objectId); - + String [] encodings = GitBlit.getEncodings(); + // markdown page links add(new BookmarkablePageLink("blameLink", BlamePage.class, WicketUtils.newPathParameter(repositoryName, objectId, markdownPath))); @@ -49,7 +51,7 @@ public class MarkdownPage extends RepositoryPage { WicketUtils.newPathParameter(repositoryName, Constants.HEAD, markdownPath))); // Read raw markdown content and transform it to html - String markdownText = JGitUtils.getStringContent(r, commit.getTree(), markdownPath); + String markdownText = JGitUtils.getStringContent(r, commit.getTree(), markdownPath, encodings); String htmlText; try { htmlText = MarkdownUtils.transformMarkdown(markdownText); diff --git a/src/com/gitblit/wicket/pages/RawPage.java 
b/src/com/gitblit/wicket/pages/RawPage.java index f71d986f..00cc5bf7 100644 --- a/src/com/gitblit/wicket/pages/RawPage.java +++ b/src/com/gitblit/wicket/pages/RawPage.java @@ -43,7 +43,8 @@ public class RawPage extends WebPage { final String repositoryName = WicketUtils.getRepositoryName(params); final String objectId = WicketUtils.getObject(params); final String blobPath = WicketUtils.getPath(params); - + String [] encodings = GitBlit.getEncodings(); + Repository r = GitBlit.self().getRepository(repositoryName); if (r == null) { error(getString("gb.canNotLoadRepository") + " " + repositoryName); @@ -53,7 +54,7 @@ public class RawPage extends WebPage { if (StringUtils.isEmpty(blobPath)) { // objectid referenced raw view - Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r, objectId)); + Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r, objectId, encodings)); WicketUtils.setCssClass(blobLabel, "plainprint"); add(blobLabel); } else { @@ -92,14 +93,14 @@ public class RawPage extends WebPage { default: // plain text c = new Label("rawText", JGitUtils.getStringContent(r, commit.getTree(), - blobPath)); + blobPath, encodings)); WicketUtils.setCssClass(c, "plainprint"); } add(c); } else { // plain text Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r, - commit.getTree(), blobPath)); + commit.getTree(), blobPath, encodings)); WicketUtils.setCssClass(blobLabel, "plainprint"); add(blobLabel); } diff --git a/src/com/gitblit/wicket/pages/SummaryPage.java b/src/com/gitblit/wicket/pages/SummaryPage.java index 2996b662..8e145c8d 100644 --- a/src/com/gitblit/wicket/pages/SummaryPage.java +++ b/src/com/gitblit/wicket/pages/SummaryPage.java @@ -158,7 +158,8 @@ public class SummaryPage extends RepositoryPage { } } if (!StringUtils.isEmpty(readme)) { - String markdownText = JGitUtils.getStringContent(r, head.getTree(), readme); + String [] encodings = GitBlit.getEncodings(); + String markdownText = JGitUtils.getStringContent(r, 
head.getTree(), readme, encodings); htmlText = MarkdownUtils.transformMarkdown(markdownText); } } catch (ParseException p) { diff --git a/tests/com/gitblit/tests/JGitUtilsTest.java b/tests/com/gitblit/tests/JGitUtilsTest.java index 616ea837..dc4d3c50 100644 --- a/tests/com/gitblit/tests/JGitUtilsTest.java +++ b/tests/com/gitblit/tests/JGitUtilsTest.java @@ -37,6 +37,7 @@ import org.eclipse.jgit.lib.PersonIdent; import org.eclipse.jgit.lib.Repository; import org.eclipse.jgit.lib.RepositoryCache.FileKey; import org.eclipse.jgit.revwalk.RevCommit; +import org.eclipse.jgit.revwalk.RevTree; import org.eclipse.jgit.util.FS; import org.eclipse.jgit.util.FileUtils; import org.junit.Test; @@ -265,7 +266,7 @@ public class JGitUtilsTest { @Test public void testStringContent() throws Exception { Repository repository = GitBlitSuite.getHelloworldRepository(); - String contentA = JGitUtils.getStringContent(repository, null, "java.java"); + String contentA = JGitUtils.getStringContent(repository, (RevTree) null, "java.java"); RevCommit commit = JGitUtils.getCommit(repository, Constants.HEAD); String contentB = JGitUtils.getStringContent(repository, commit.getTree(), "java.java"); String contentC = JGitUtils.getStringContent(repository, commit.getTree(), "missing.txt"); -- 2.39.5