Bug 60677 -- handle multibyte encodings correctly in HwmfGraphics' drawString. Thank...

author Tim Allison <tallison@apache.org>

Fri, 3 Feb 2017 20:19:33 +0000 (20:19 +0000)

committer Tim Allison <tallison@apache.org>

Fri, 3 Feb 2017 20:19:33 +0000 (20:19 +0000)
author Tim Allison <tallison@apache.org>
Fri, 3 Feb 2017 20:19:33 +0000 (20:19 +0000)
committer Tim Allison <tallison@apache.org>
Fri, 3 Feb 2017 20:19:33 +0000 (20:19 +0000)
diff --git a/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java b/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java

index f292313185b672259821efeb487bb14b0d133576..ef5af49e40f26893d63bd132323f3c199bbae8bb 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java
+++ b/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java
@@ -338,13 +338,52 @@ public class HwmfGraphics {
          if (dx == null || dx.length == 0) {\r
              addAttributes(as, font);\r
          } else {\r
-            for (int i=0; i<len; i++) {\r
+            int[] dxNormed = dx;\r
+            //for multi-byte encodings (e.g. Shift_JIS), the byte length\r
+            //might not equal the string length().\r
+            //The x information is stored in dx[], an array parallel to the\r
+            //byte array text[].  dx[] stores the x info in the\r
+            //first byte of a multibyte character, but dx[] stores 0\r
+            //for the other bytes in that character.\r
+            //We need to map this information to the String offsets\r
+            //dx[0] = 13 text[0] = -125\r
+            //dx[1] = 0  text[1] = 118\r
+            //dx[2] = 14 text[2] = -125\r
+            //dx[3] = 0  text[3] = -115\r
+            // needs to be remapped as:\r
+            //dxNormed[0] = 13 textString.codePointAt(0) = U+30D7\r
+            //dxNormed[1] = 14 textString.codePointAt(1) = U+30ED\r
+            if (textString.length() != text.length) {\r
+                int codePoints = textString.codePointCount(0, textString.length());\r
+                dxNormed = new int[codePoints];\r
+                int dxPosition = 0;\r
+                //dxNormed has one slot per code point, so it is indexed by a\r
+                //code point counter; the char offset runs ahead of that counter\r
+                //as soon as the text contains a surrogate pair\r
+                int cpIndex = 0;\r
+                //stop early if dx[] is shorter than the re-encoded text;\r
+                //the remaining dxNormed entries then stay 0\r
+                for (int offset = 0; offset < textString.length() && dxPosition < dx.length; ) {\r
+                    dxNormed[cpIndex++] = dx[dxPosition];\r
+                    int cp = textString.codePointAt(offset);\r
+                    int charCount = Character.charCount(cp);\r
+                    //now figure out how many bytes it takes to encode that\r
+                    //code point in the charset\r
+                    int byteLength = textString.substring(offset, offset + charCount).getBytes(charset).length;\r
+                    dxPosition += byteLength;\r
+                    offset += charCount;\r
+                }\r
+            }\r
+            for (int i = 0; i < dxNormed.length; i++) {\r
                  addAttributes(as, font);\r
                  // Tracking works as a prefix/advance space on characters whereas\r
                  // dx[...] is the complete width of the current char\r
                  // therefore we need to add the additional/suffix width to the next char\r
-                if (i<len-1) {\r
-                    as.addAttribute(TextAttribute.TRACKING, (dx[i]-fontW)/fontH, i+1, i+2);\r
+                // NOTE(review): i + 1 / i + 2 are presumably char indices into the\r
+                // attributed text while i counts code points; the two agree only while\r
+                // the text has no surrogate pairs -- TODO confirm for supplementary chars\r
+                if (i < dxNormed.length - 1) {\r
+                    as.addAttribute(TextAttribute.TRACKING, (dxNormed[i] - fontW) / fontH, i + 1, i + 2);\r
                  }\r
              }\r
          }\r
diff --git a/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java b/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java

index 2f0838f1286ce4f8024823f6f6836d075807f2eb..746c5b0218b321fe4e6bf2d8184489a2dce70775 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java
@@ -222,4 +222,43 @@ public class TestHwmfParsing {
          assertTrue(txt.contains("\u0411\u0430\u043B\u0430\u043D\u0441"));\r
      }\r
  \r
+    /**\r
+     * Collects the text of every extTextOut record, decoding each one with the charset of\r
+     * the most recently seen createFontIndirect record (LocaleUtil.CHARSET_1252 until one is\r
+     * seen, or when the font reports no charset), and checks for the expected Japanese string.\r
+     */\r
+    @Test\r
+    @Ignore("If we decide we can use the common crawl file attached to Bug 60677, " +\r
+            "we can turn this back on")\r
+    public void testShift_JIS() throws Exception {\r
+        //TODO: move test file to framework and fix this\r
+        File f = new File("C:/data/file8.wmf");\r
+        FileInputStream fis = new FileInputStream(f);\r
+        HwmfPicture wmf;\r
+        try {\r
+            wmf = new HwmfPicture(fis);\r
+        } finally {\r
+            //close the stream even when parsing throws\r
+            fis.close();\r
+        }\r
+\r
+        Charset charset = LocaleUtil.CHARSET_1252;\r
+        StringBuilder sb = new StringBuilder();\r
+        //this is pure hackery for specifying the font\r
+        //this happens to work on this test file, but you need to\r
+        //do what Graphics does by maintaining the stack, etc.!\r
+        for (HwmfRecord r : wmf.getRecords()) {\r
+            if (r.getRecordType().equals(HwmfRecordType.createFontIndirect)) {\r
+                HwmfFont font = ((HwmfText.WmfCreateFontIndirect)r).getFont();\r
+                Charset fontCharset = font.getCharSet().getCharset();\r
+                charset = (fontCharset == null) ? LocaleUtil.CHARSET_1252 : fontCharset;\r
+            }\r
+            if (r.getRecordType().equals(HwmfRecordType.extTextOut)) {\r
+                HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut)r;\r
+                sb.append(textOut.getText(charset)).append("\n");\r
+            }\r
+        }\r
+        String txt = sb.toString();\r
+        assertTrue(txt.contains("\u822A\u7A7A\u60C5\u5831\u696D\u52D9\u3078\u306E\uFF27\uFF29\uFF33"));\r
+    }\r
  }\r
author	Tim Allison <tallison@apache.org>
	Fri, 3 Feb 2017 20:19:33 +0000 (20:19 +0000)
committer	Tim Allison <tallison@apache.org>
	Fri, 3 Feb 2017 20:19:33 +0000 (20:19 +0000)
src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java		patch \| blob \| history