From ae9e157ef4e6a3708489725d4436cc15d273308f Mon Sep 17 00:00:00 2001 From: James Moger Date: Thu, 7 Jun 2012 17:30:18 -0400 Subject: Try multiple encodings when working with string blobs (issue 97) --- src/com/gitblit/GitBlit.java | 9 ++++++ src/com/gitblit/LuceneExecutor.java | 3 +- src/com/gitblit/PagesServlet.java | 8 +++-- src/com/gitblit/utils/JGitUtils.java | 11 ++++--- src/com/gitblit/utils/StringUtils.java | 42 ++++++++++++++++++++++++++ src/com/gitblit/wicket/pages/BlobPage.java | 11 ++++--- src/com/gitblit/wicket/pages/MarkdownPage.java | 6 ++-- src/com/gitblit/wicket/pages/RawPage.java | 9 +++--- src/com/gitblit/wicket/pages/SummaryPage.java | 3 +- 9 files changed, 81 insertions(+), 21 deletions(-) (limited to 'src/com/gitblit') diff --git a/src/com/gitblit/GitBlit.java b/src/com/gitblit/GitBlit.java index f96340ae..dc53540e 100644 --- a/src/com/gitblit/GitBlit.java +++ b/src/com/gitblit/GitBlit.java @@ -189,6 +189,15 @@ public class GitBlit implements ServletContextListener { return self().timezone; } + /** + * Returns the user-defined blob encodings. + * + * @return an array of encodings, may be empty + */ + public static String [] getEncodings() { + return getStrings(Keys.web.blobEncodings).toArray(new String[0]); + } + /** * Returns the boolean value for the specified key. 
If the key does not diff --git a/src/com/gitblit/LuceneExecutor.java b/src/com/gitblit/LuceneExecutor.java index afd1cc5a..b3165434 100644 --- a/src/com/gitblit/LuceneExecutor.java +++ b/src/com/gitblit/LuceneExecutor.java @@ -642,6 +642,7 @@ public class LuceneExecutor implements Runnable { String branch, RevCommit commit) { IndexResult result = new IndexResult(); try { + String [] encodings = storedSettings.getStrings(Keys.web.blobEncodings).toArray(new String[0]); List changedPaths = JGitUtils.getFilesInCommit(repository, commit); String revDate = DateTools.timeToString(commit.getCommitTime() * 1000L, Resolution.MINUTE); @@ -674,7 +675,7 @@ public class LuceneExecutor implements Runnable { if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) { // read the blob content String str = JGitUtils.getStringContent(repository, commit.getTree(), - path.path); + path.path, encodings); doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED)); writer.addDocument(doc); } diff --git a/src/com/gitblit/PagesServlet.java b/src/com/gitblit/PagesServlet.java index d6304f70..ad9276b4 100644 --- a/src/com/gitblit/PagesServlet.java +++ b/src/com/gitblit/PagesServlet.java @@ -141,13 +141,15 @@ public class PagesServlet extends HttpServlet { } response.setDateHeader("Last-Modified", JGitUtils.getCommitDate(commit).getTime()); + String [] encodings = GitBlit.getEncodings(); + RevTree tree = commit.getTree(); byte[] content = null; if (StringUtils.isEmpty(resource)) { // find resource String[] files = { "index.html", "index.htm", "index.mkd" }; for (String file : files) { - content = JGitUtils.getStringContent(r, tree, file) + content = JGitUtils.getStringContent(r, tree, file, encodings) .getBytes(Constants.ENCODING); if (content != null) { resource = file; @@ -165,7 +167,7 @@ public class PagesServlet extends HttpServlet { contentType = "text/plain"; } if (contentType.startsWith("text")) { - content = JGitUtils.getStringContent(r, tree, resource).getBytes( + 
content = JGitUtils.getStringContent(r, tree, resource, encodings).getBytes( Constants.ENCODING); } else { content = JGitUtils.getByteContent(r, tree, resource); @@ -177,7 +179,7 @@ public class PagesServlet extends HttpServlet { // no content, try custom 404 page if (ArrayUtils.isEmpty(content)) { - String custom404 = JGitUtils.getStringContent(r, tree, "404.html"); + String custom404 = JGitUtils.getStringContent(r, tree, "404.html", encodings); if (!StringUtils.isEmpty(custom404)) { content = custom404.getBytes(Constants.ENCODING); } diff --git a/src/com/gitblit/utils/JGitUtils.java b/src/com/gitblit/utils/JGitUtils.java index f5ca5efd..72a8ab3c 100644 --- a/src/com/gitblit/utils/JGitUtils.java +++ b/src/com/gitblit/utils/JGitUtils.java @@ -20,7 +20,6 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.nio.charset.Charset; import java.text.MessageFormat; import java.util.ArrayList; import java.util.Arrays; @@ -543,14 +542,15 @@ public class JGitUtils { * @param tree * if null, the RevTree from HEAD is assumed. * @param blobPath + * @param charsets optional * @return UTF-8 string content */ - public static String getStringContent(Repository repository, RevTree tree, String blobPath) { + public static String getStringContent(Repository repository, RevTree tree, String blobPath, String... charsets) { byte[] content = getByteContent(repository, tree, blobPath); if (content == null) { return null; } - return new String(content, Charset.forName(Constants.CHARACTER_ENCODING)); + return StringUtils.decodeString(content, charsets); } /** @@ -589,14 +589,15 @@ public class JGitUtils { * * @param repository * @param objectId + * @param charsets optional * @return UTF-8 string content */ - public static String getStringContent(Repository repository, String objectId) { + public static String getStringContent(Repository repository, String objectId, String... 
charsets) { byte[] content = getByteContent(repository, objectId); if (content == null) { return null; } - return new String(content, Charset.forName(Constants.CHARACTER_ENCODING)); + return StringUtils.decodeString(content, charsets); } /** diff --git a/src/com/gitblit/utils/StringUtils.java b/src/com/gitblit/utils/StringUtils.java index 2c357241..baed5f0c 100644 --- a/src/com/gitblit/utils/StringUtils.java +++ b/src/com/gitblit/utils/StringUtils.java @@ -16,13 +16,23 @@ package com.gitblit.utils; import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Set; import java.util.regex.PatternSyntaxException; /** @@ -550,4 +560,36 @@ public class StringUtils { // remember to append any characters to the right of a match return sb.toString(); }
+
+	/**
+	 * Decodes bytes with the first charset that decodes them without a coding
+	 * error: the given charsets, then UTF-8, ISO-8859-1 and the platform default.
+	 * Unknown charset names are skipped; the last resort is a lenient UTF-8 decode.
+	 *
+	 * @param content the bytes to decode
+	 * @param charsets optional charset names to try first, in the given order
+	 * @return the decoded string
+	 */
+	public static String decodeString(byte [] content, String... charsets) {
+		Set<String> sets = new LinkedHashSet<String>();
+		if (!ArrayUtils.isEmpty(charsets)) {
+			sets.addAll(Arrays.asList(charsets));
+		}
+		sets.addAll(Arrays.asList("UTF-8", "ISO-8859-1", Charset.defaultCharset().name()));
+		for (String charset : sets) {
+			try {
+				Charset cs = Charset.forName(charset);
+				CharsetDecoder decoder = cs.newDecoder();
+				CharBuffer buffer = decoder.decode(ByteBuffer.wrap(content));
+				return buffer.toString();
+			} catch (CharacterCodingException e) {
+				// bytes are not valid in this charset; advance to the next one
+			} catch (IllegalCharsetNameException e) {
+				// ignore illegal charset names
+			} catch (UnsupportedCharsetException e) {
+				// ignore unsupported charsets
+			}
+		}
+		return new String(content, Charset.forName("UTF-8")); // NOTE(review): probably unreachable, ISO-8859-1 should accept any bytes - confirm
+	}
 } \ No newline at end of file diff --git a/src/com/gitblit/wicket/pages/BlobPage.java b/src/com/gitblit/wicket/pages/BlobPage.java index 1c438370..fb5a962b 100644 --- a/src/com/gitblit/wicket/pages/BlobPage.java +++ b/src/com/gitblit/wicket/pages/BlobPage.java @@ -41,7 +41,8 @@ public class BlobPage extends RepositoryPage { Repository r = getRepository(); final String blobPath = WicketUtils.getPath(params); - + String [] encodings = GitBlit.getEncodings(); + if (StringUtils.isEmpty(blobPath)) { // blob by objectid @@ -54,7 +55,7 @@ public class BlobPage extends RepositoryPage { add(new BookmarkablePageLink("headLink", BlobPage.class).setEnabled(false)); add(new CommitHeaderPanel("commitHeader", objectId)); add(new PathBreadcrumbsPanel("breadcrumbs", repositoryName, blobPath, objectId)); - Component c = new Label("blobText", JGitUtils.getStringContent(r, objectId)); + Component c = new Label("blobText", JGitUtils.getStringContent(r, objectId, encodings)); WicketUtils.setCssClass(c, "plainprint"); add(c); } else { @@ -111,7 +112,7 @@ public class BlobPage extends RepositoryPage { case 1: // PrettyPrint blob text c = new Label("blobText", JGitUtils.getStringContent(r, commit.getTree(), - blobPath)); + blobPath, encodings)); 
WicketUtils.setCssClass(c, "prettyprint linenums"); break; case 2: @@ -125,14 +126,14 @@ public class BlobPage extends RepositoryPage { default: // plain text c = new Label("blobText", JGitUtils.getStringContent(r, commit.getTree(), - blobPath)); + blobPath, encodings)); WicketUtils.setCssClass(c, "plainprint"); } add(c); } else { // plain text Label blobLabel = new Label("blobText", JGitUtils.getStringContent(r, - commit.getTree(), blobPath)); + commit.getTree(), blobPath, encodings)); WicketUtils.setCssClass(blobLabel, "plainprint"); add(blobLabel); } diff --git a/src/com/gitblit/wicket/pages/MarkdownPage.java b/src/com/gitblit/wicket/pages/MarkdownPage.java index aaf12bad..5764235a 100644 --- a/src/com/gitblit/wicket/pages/MarkdownPage.java +++ b/src/com/gitblit/wicket/pages/MarkdownPage.java @@ -24,6 +24,7 @@ import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.Repository; import org.eclipse.jgit.revwalk.RevCommit; +import com.gitblit.GitBlit; import com.gitblit.utils.JGitUtils; import com.gitblit.utils.MarkdownUtils; import com.gitblit.wicket.WicketUtils; @@ -37,7 +38,8 @@ public class MarkdownPage extends RepositoryPage { Repository r = getRepository(); RevCommit commit = JGitUtils.getCommit(r, objectId); - + String [] encodings = GitBlit.getEncodings(); + // markdown page links add(new BookmarkablePageLink("blameLink", BlamePage.class, WicketUtils.newPathParameter(repositoryName, objectId, markdownPath))); @@ -49,7 +51,7 @@ public class MarkdownPage extends RepositoryPage { WicketUtils.newPathParameter(repositoryName, Constants.HEAD, markdownPath))); // Read raw markdown content and transform it to html - String markdownText = JGitUtils.getStringContent(r, commit.getTree(), markdownPath); + String markdownText = JGitUtils.getStringContent(r, commit.getTree(), markdownPath, encodings); String htmlText; try { htmlText = MarkdownUtils.transformMarkdown(markdownText); diff --git a/src/com/gitblit/wicket/pages/RawPage.java 
b/src/com/gitblit/wicket/pages/RawPage.java index f71d986f..00cc5bf7 100644 --- a/src/com/gitblit/wicket/pages/RawPage.java +++ b/src/com/gitblit/wicket/pages/RawPage.java @@ -43,7 +43,8 @@ public class RawPage extends WebPage { final String repositoryName = WicketUtils.getRepositoryName(params); final String objectId = WicketUtils.getObject(params); final String blobPath = WicketUtils.getPath(params); - + String [] encodings = GitBlit.getEncodings(); + Repository r = GitBlit.self().getRepository(repositoryName); if (r == null) { error(getString("gb.canNotLoadRepository") + " " + repositoryName); @@ -53,7 +54,7 @@ public class RawPage extends WebPage { if (StringUtils.isEmpty(blobPath)) { // objectid referenced raw view - Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r, objectId)); + Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r, objectId, encodings)); WicketUtils.setCssClass(blobLabel, "plainprint"); add(blobLabel); } else { @@ -92,14 +93,14 @@ public class RawPage extends WebPage { default: // plain text c = new Label("rawText", JGitUtils.getStringContent(r, commit.getTree(), - blobPath)); + blobPath, encodings)); WicketUtils.setCssClass(c, "plainprint"); } add(c); } else { // plain text Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r, - commit.getTree(), blobPath)); + commit.getTree(), blobPath, encodings)); WicketUtils.setCssClass(blobLabel, "plainprint"); add(blobLabel); } diff --git a/src/com/gitblit/wicket/pages/SummaryPage.java b/src/com/gitblit/wicket/pages/SummaryPage.java index 2996b662..8e145c8d 100644 --- a/src/com/gitblit/wicket/pages/SummaryPage.java +++ b/src/com/gitblit/wicket/pages/SummaryPage.java @@ -158,7 +158,8 @@ public class SummaryPage extends RepositoryPage { } } if (!StringUtils.isEmpty(readme)) { - String markdownText = JGitUtils.getStringContent(r, head.getTree(), readme); + String [] encodings = GitBlit.getEncodings(); + String markdownText = JGitUtils.getStringContent(r, 
head.getTree(), readme, encodings); htmlText = MarkdownUtils.transformMarkdown(markdownText); } } catch (ParseException p) { -- cgit v1.2.3