about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Tim Allison <tallison@apache.org> 2017-01-19 20:19:26 +0000
committer: Tim Allison <tallison@apache.org> 2017-01-19 20:19:26 +0000
commit: dc66cc51586ae5e3ec862d71543087b601abab48 (patch)
tree: 39abc66e14c348de927d39a078c4350b93270825 /src
parent: 5750c85be9a1b6939f12a32c896f5d80e801a73b (diff)
download: poi-dc66cc51586ae5e3ec862d71543087b601abab48.tar.gz
download: poi-dc66cc51586ae5e3ec862d71543087b601abab48.zip
Bug 60608 -- improve charset handling in Hwmf
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1779519 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
-rw-r--r-- src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java | 37
-rw-r--r-- src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java | 66
-rw-r--r-- src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java | 60
-rw-r--r-- src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java | 38
4 files changed, 161 insertions, 40 deletions
diff --git a/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java b/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java
index 5c0f35bd03..2716696dee 100644
--- a/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java
+++ b/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java
@@ -29,6 +29,7 @@ import java.awt.font.TextAttribute;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
+import java.nio.charset.Charset;
import java.text.AttributedString;
import java.util.ArrayList;
import java.util.LinkedList;
@@ -48,8 +49,11 @@ import org.apache.poi.hwmf.record.HwmfPenStyle.HwmfLineDash;
import org.apache.poi.sl.draw.DrawFactory;
import org.apache.poi.sl.draw.DrawFontManager;
import org.apache.poi.sl.draw.Drawable;
+import org.apache.poi.util.LocaleUtil;
public class HwmfGraphics {
+
+ private static final Charset DEFAULT_CHARSET = LocaleUtil.CHARSET_1252;
private final Graphics2D graphicsCtx;
private final List<HwmfDrawProperties> propStack = new LinkedList<HwmfDrawProperties>();
private HwmfDrawProperties prop = new HwmfDrawProperties();
@@ -311,14 +315,34 @@ public class HwmfGraphics {
break;
}
}
-
+
+ /**
+ * Draws the given text without per-character spacing by delegating to
+ * {@link #drawString(String, Rectangle2D, int[])} with a {@code null} dx array.
+ *
+ * @param text the text to draw
+ * @param bounds the bounds handed on unchanged to the three-argument overload
+ * @deprecated use {@link #drawString(byte[], Rectangle2D)}
+ */
+ @Deprecated
public void drawString(String text, Rectangle2D bounds) {
drawString(text, bounds, null);
}
-
+
+ /**
+ * Draws the given raw text bytes without per-character spacing by delegating to
+ * {@link #drawString(byte[], Rectangle2D, int[])} with a {@code null} dx array.
+ *
+ * @param text the undecoded text bytes
+ * @param bounds the bounds handed on unchanged to the three-argument overload
+ */
+ public void drawString(byte[] text, Rectangle2D bounds) {
+ drawString(text, bounds, null);
+ }
+
+ /**
+ * Draws the given text by encoding it to bytes with {@code DEFAULT_CHARSET} and
+ * delegating to {@link #drawString(byte[], Rectangle2D, int[])}.
+ * <p>
+ * A {@code null} text is ignored. Characters that {@code DEFAULT_CHARSET} cannot
+ * encode are replaced by {@link String#getBytes(java.nio.charset.Charset)}, so the
+ * byte-based overload should be preferred.
+ *
+ * @param text the text to draw; may be {@code null}
+ * @param bounds the bounds handed on unchanged to the byte-based overload
+ * @param dx handed on unchanged to the byte-based overload; may be {@code null}
+ * @deprecated use {@link #drawString(byte[], Rectangle2D, int[])}
+ */
+ @Deprecated
public void drawString(String text, Rectangle2D bounds, int dx[]) {
+ if (text == null) {
+ // getBytes() would throw here; the byte-based overload treats null text as a no-op
+ return;
+ }
+ drawString(text.getBytes(DEFAULT_CHARSET), bounds, dx);
+ }
+
+ public void drawString(byte[] text, Rectangle2D bounds, int dx[]) {
HwmfFont font = prop.getFont();
- if (font == null || text == null || text.isEmpty()) {
+ if (font == null || text == null || text.length == 0) {
return;
}
@@ -326,8 +350,11 @@ public class HwmfGraphics {
// TODO: another approx. ...
double fontW = fontH/1.8;
- int len = text.length();
- AttributedString as = new AttributedString(text);
+ int len = text.length;
+ Charset charset = (font.getCharSet().getCharset() == null)?
+ DEFAULT_CHARSET : font.getCharSet().getCharset();
+ String textString = new String(text, charset);
+ AttributedString as = new AttributedString(textString);
if (dx == null || dx.length == 0) {
addAttributes(as, font);
} else {
diff --git a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
index fce75a3656..9225de47ec 100644
--- a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
+++ b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
@@ -19,67 +19,93 @@ package org.apache.poi.hwmf.record;
import java.io.IOException;
import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
import org.apache.poi.util.LittleEndianConsts;
import org.apache.poi.util.LittleEndianInputStream;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
/**
* The Font object specifies the attributes of a logical font
*/
public class HwmfFont {
+
+ private static final POILogger logger = POILogFactory.getLogger(HwmfFont.class);
+
public enum WmfCharset {
/** Specifies the English character set. */
- ANSI_CHARSET(0x00000000),
+ ANSI_CHARSET(0x00000000, "Cp1252"),
/**
* Specifies a character set based on the current system locale;
* for example, when the system locale is United States English,
* the default character set is ANSI_CHARSET.
*/
- DEFAULT_CHARSET(0x00000001),
+ DEFAULT_CHARSET(0x00000001, "Cp1252"),
/** Specifies a character set of symbols. */
- SYMBOL_CHARSET(0x00000002),
+ SYMBOL_CHARSET(0x00000002, ""),
/** Specifies the Apple Macintosh character set. */
- MAC_CHARSET(0x0000004D),
+ MAC_CHARSET(0x0000004D, "MacRoman"),
/** Specifies the Japanese character set. */
- SHIFTJIS_CHARSET(0x00000080),
+ SHIFTJIS_CHARSET(0x00000080, "Shift_JIS"),
/** Also spelled "Hangeul". Specifies the Hangul Korean character set. */
- HANGUL_CHARSET(0x00000081),
+ HANGUL_CHARSET(0x00000081, "cp949"),
/** Also spelled "Johap". Specifies the Johab Korean character set. */
- JOHAB_CHARSET(0x00000082),
+ JOHAB_CHARSET(0x00000082, "x-Johab"),
/** Specifies the "simplified" Chinese character set for People's Republic of China. */
- GB2312_CHARSET(0x00000086),
+ GB2312_CHARSET(0x00000086, "GB2312"),
/**
* Specifies the "traditional" Chinese character set, used mostly in
* Taiwan and in the Hong Kong and Macao Special Administrative Regions.
*/
- CHINESEBIG5_CHARSET(0x00000088),
+ CHINESEBIG5_CHARSET(0x00000088, "Big5"),
/** Specifies the Greek character set. */
- GREEK_CHARSET(0x000000A1),
+ GREEK_CHARSET(0x000000A1, "Cp1253"),
/** Specifies the Turkish character set. */
- TURKISH_CHARSET(0x000000A2),
+ TURKISH_CHARSET(0x000000A2, "Cp1254"),
/** Specifies the Vietnamese character set. */
- VIETNAMESE_CHARSET(0x000000A3),
+ VIETNAMESE_CHARSET(0x000000A3, "Cp1258"),
/** Specifies the Hebrew character set. */
- HEBREW_CHARSET(0x000000B1),
+ HEBREW_CHARSET(0x000000B1, "Cp1255"),
/** Specifies the Arabic character set. */
- ARABIC_CHARSET(0x000000B2),
+ ARABIC_CHARSET(0x000000B2, "Cp1256"),
/** Specifies the Baltic (Northeastern European) character set. */
- BALTIC_CHARSET(0x000000BA),
+ BALTIC_CHARSET(0x000000BA, "Cp1257"),
/** Specifies the Russian Cyrillic character set. */
- RUSSIAN_CHARSET(0x000000CC),
+ RUSSIAN_CHARSET(0x000000CC, "Cp1251"),
/** Specifies the Thai character set. */
- THAI_CHARSET(0x000000DE),
+ THAI_CHARSET(0x000000DE, "x-windows-874"),
/** Specifies a Eastern European character set. */
- EASTEUROPE_CHARSET(0x000000EE),
+ EASTEUROPE_CHARSET(0x000000EE, "Cp1250"),
/**
* Specifies a mapping to one of the OEM code pages,
* according to the current system locale setting.
*/
- OEM_CHARSET(0x000000FF);
+ OEM_CHARSET(0x000000FF, "Cp1252");
int flag;
- WmfCharset(int flag) {
+ Charset charset;
+
+ WmfCharset(int flag, String javaCharsetName) {
this.flag = flag;
+ if (javaCharsetName.length() > 0) {
+ try {
+ charset = Charset.forName(javaCharsetName);
+ return;
+ } catch (UnsupportedCharsetException e) {
+ logger.log(POILogger.WARN, "Unsupported charset: "+javaCharsetName);
+ }
+ }
+ charset = null;
+ }
+
+ /**
+ * Returns the Java charset that the constructor resolved for this WMF charset constant.
+ *
+ * @return the charset, or <code>null</code> if the constant was declared with an empty
+ * charset name or if the named charset is not supported by the running JVM
+ */
+ public Charset getCharset() {
+ return charset;
}
static WmfCharset valueOf(int flag) {
diff --git a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java
index 03ebc353c4..108970d761 100644
--- a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java
+++ b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java
@@ -19,6 +19,7 @@ package org.apache.poi.hwmf.record;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
+import java.nio.charset.Charset;
import org.apache.poi.hwmf.draw.HwmfDrawProperties;
import org.apache.poi.hwmf.draw.HwmfGraphics;
@@ -27,7 +28,6 @@ import org.apache.poi.util.BitField;
import org.apache.poi.util.BitFieldFactory;
import org.apache.poi.util.LittleEndianConsts;
import org.apache.poi.util.LittleEndianInputStream;
-import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
@@ -144,7 +144,7 @@ public class HwmfText {
* length of the string.
* The string is written at the location specified by the XStart and YStart fields.
*/
- private String text;
+ private byte[] rawTextBytes;
/**
* A 16-bit signed integer that defines the vertical (y-axis) coordinate, in logical
* units, of the point where drawing is to start.
@@ -164,18 +164,33 @@ public class HwmfText {
@Override
public int init(LittleEndianInputStream leis, long recordSize, int recordFunction) throws IOException {
stringLength = leis.readShort();
- byte buf[] = new byte[stringLength+(stringLength&1)];
- leis.readFully(buf);
- text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252).trim();
+ rawTextBytes = new byte[stringLength+(stringLength&1)];
+ leis.readFully(rawTextBytes);
yStart = leis.readShort();
xStart = leis.readShort();
- return 3*LittleEndianConsts.SHORT_SIZE+buf.length;
+ return 3*LittleEndianConsts.SHORT_SIZE+rawTextBytes.length;
}
@Override
public void draw(HwmfGraphics ctx) {
Rectangle2D bounds = new Rectangle2D.Double(xStart, yStart, 0, 0);
- ctx.drawString(text, bounds);
+ ctx.drawString(getTextBytes(), bounds);
+ }
+
+ /**
+ * Decodes the text of this record with the given charset.
+ * Only the bytes returned by getTextBytes() are decoded, i.e. the optional
+ * padding byte at the end of the raw buffer is not part of the result.
+ *
+ * @param charset the charset used to decode the raw text bytes
+ * @return the decoded text
+ */
+ public String getText(Charset charset) {
+ return new String(getTextBytes(), charset);
+ }
+
+ /**
+ * Returns the text bytes without the optional padding byte.
+ *
+ * @return a fresh array holding the first stringLength bytes of rawTextBytes.
+ * The length is clamped to the range 0..rawTextBytes.length, so a negative
+ * length read from a damaged record yields an empty array instead of a
+ * NegativeArraySizeException.
+ */
+ private byte[] getTextBytes() {
+ // stringLength is filled from a signed 16-bit read and can be negative in a corrupt file
+ final int len = Math.max(0, Math.min(stringLength, rawTextBytes.length));
+ byte[] ret = new byte[len];
+ System.arraycopy(rawTextBytes, 0, ret, 0, len);
+ return ret;
}
}
@@ -264,7 +279,7 @@ public class HwmfText {
* the length is odd, an extra byte is placed after it so that the following member (optional Dx) is
* aligned on a 16-bit boundary.
*/
- private String text;
+ private byte[] rawTextBytes;
/**
* An optional array of 16-bit signed integers that indicate the distance between
* origins of adjacent character cells. For example, Dx[i] logical units separate the origins of
@@ -300,10 +315,9 @@ public class HwmfText {
size += 4*LittleEndianConsts.SHORT_SIZE;
}
- byte buf[] = new byte[stringLength+(stringLength&1)];
- leis.readFully(buf);
- text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252);
- size += buf.length;
+ rawTextBytes = new byte[stringLength+(stringLength&1)];
+ leis.readFully(rawTextBytes);
+ size += rawTextBytes.length;
if (size >= remainingRecordSize) {
logger.log(POILogger.INFO, "META_EXTTEXTOUT doesn't contain character tracking info");
@@ -327,7 +341,23 @@ public class HwmfText {
@Override
public void draw(HwmfGraphics ctx) {
Rectangle2D bounds = new Rectangle2D.Double(x, y, 0, 0);
- ctx.drawString(text, bounds, dx);
+ ctx.drawString(getTextBytes(), bounds, dx);
+ }
+
+ /**
+ * Decodes the text of this record with the given charset.
+ * Only the bytes returned by getTextBytes() are decoded, i.e. the optional
+ * padding byte at the end of the raw buffer is not part of the result.
+ *
+ * @param charset the charset used to decode the raw text bytes
+ * @return the decoded text
+ */
+ public String getText(Charset charset) {
+ return new String(getTextBytes(), charset);
+ }
+
+ /**
+ * Returns the text bytes without the optional padding byte.
+ *
+ * @return a fresh array holding the first stringLength bytes of rawTextBytes.
+ * The length is clamped to the range 0..rawTextBytes.length, so a negative
+ * length yields an empty array instead of a NegativeArraySizeException.
+ */
+ private byte[] getTextBytes() {
+ // NOTE(review): stringLength is presumably read as a signed 16-bit value, so it may be
+ // negative in a corrupt file - confirm against init()
+ final int len = Math.max(0, Math.min(stringLength, rawTextBytes.length));
+ byte[] ret = new byte[len];
+ System.arraycopy(rawTextBytes, 0, ret, 0, len);
+ return ret;
}
}
@@ -523,5 +553,9 @@ public class HwmfText {
public void applyObject(HwmfGraphics ctx) {
ctx.getProperties().setFont(font);
}
+
+ /**
+ * Returns the font held by this record.
+ *
+ * @return the same font object that applyObject() sets on the graphics properties
+ */
+ public HwmfFont getFont() {
+ return font;
+ }
}
}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java b/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java
index 17226a654b..2f0838f128 100644
--- a/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java
@@ -18,7 +18,9 @@
package org.apache.poi.hwmf;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import javax.imageio.ImageIO;
import java.awt.Dimension;
import java.awt.Graphics2D;
import java.awt.RenderingHints;
@@ -31,21 +33,24 @@ import java.io.FileOutputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.net.URL;
+import java.nio.charset.Charset;
import java.util.List;
import java.util.Locale;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
-import javax.imageio.ImageIO;
-
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwmf.record.HwmfFill.HwmfImageRecord;
+import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.hwmf.record.HwmfRecord;
+import org.apache.poi.hwmf.record.HwmfRecordType;
+import org.apache.poi.hwmf.record.HwmfText;
import org.apache.poi.hwmf.usermodel.HwmfPicture;
import org.apache.poi.sl.usermodel.PictureData;
import org.apache.poi.sl.usermodel.PictureData.PictureType;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
+import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.Units;
import org.junit.Ignore;
import org.junit.Test;
@@ -188,4 +193,33 @@ public class TestHwmfParsing {
}
}
}
+
+ /**
+ * Extracts the text of all extTextOut records of a Cyrillic sample file, decoding each
+ * with the charset of the most recently seen createFontIndirect record, and checks that
+ * two expected Cyrillic words are present. Ignored until the sample file is available.
+ */
+ @Test
+ @Ignore("If we decide we can use common crawl file specified, we can turn this back on")
+ public void testCyrillic() throws Exception {
+ //TODO: move test file to framework and fix this
+ File dir = new File("C:/somethingOrOther");
+ File f = new File(dir, "ZMLH54SPLI76NQ7XMKVB7SMUJA2HTXTS-2.wmf");
+ StringBuilder sb = new StringBuilder();
+ // close the stream even if parsing fails; the original never closed it
+ FileInputStream fis = new FileInputStream(f);
+ try {
+ HwmfPicture wmf = new HwmfPicture(fis);
+
+ Charset charset = LocaleUtil.CHARSET_1252;
+ //this is pure hackery for specifying the font
+ //this happens to work on this test file, but you need to
+ //do what Graphics does by maintaining the stack, etc.!
+ for (HwmfRecord r : wmf.getRecords()) {
+ if (r.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
+ HwmfFont font = ((HwmfText.WmfCreateFontIndirect)r).getFont();
+ // fall back to Cp1252 when the font's WMF charset has no Java charset
+ Charset fontCharset = font.getCharSet().getCharset();
+ charset = (fontCharset == null) ? LocaleUtil.CHARSET_1252 : fontCharset;
+ }
+ if (r.getRecordType().equals(HwmfRecordType.extTextOut)) {
+ HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut)r;
+ sb.append(textOut.getText(charset)).append("\n");
+ }
+ }
+ } finally {
+ fis.close();
+ }
+ String txt = sb.toString();
+ assertTrue(txt.contains("\u041E\u0431\u0449\u043E"));
+ assertTrue(txt.contains("\u0411\u0430\u043B\u0430\u043D\u0441"));
+ }
+
}