diff options
24 files changed, 971 insertions, 64 deletions
diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java index 25ac41a9be..19edc1455b 100644 --- a/src/integrationtest/org/apache/poi/TestAllFiles.java +++ b/src/integrationtest/org/apache/poi/TestAllFiles.java @@ -218,6 +218,9 @@ public class TestAllFiles { "document/Word6_sections2.doc", "document/Word95.doc", "document/word95err.doc", + "document/Bug60936.doc", + "document/Bug60942.doc", + "document/Bug60942b.doc", "hpsf/TestMickey.doc", "document/52117.doc" ); diff --git a/src/java/org/apache/poi/util/CodePageUtil.java b/src/java/org/apache/poi/util/CodePageUtil.java index 145929182a..5be1c5077e 100644 --- a/src/java/org/apache/poi/util/CodePageUtil.java +++ b/src/java/org/apache/poi/util/CodePageUtil.java @@ -18,6 +18,9 @@ package org.apache.poi.util; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; +import java.util.HashSet; +import java.util.Set; /** * Utilities for working with Microsoft CodePages. @@ -27,6 +30,13 @@ import java.io.UnsupportedEncodingException; */ public class CodePageUtil { + + public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>(); + static { + //others? + VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5); + } + /** <p>Codepage 037, a special case</p> */ public static final int CP_037 = 37; diff --git a/src/java/org/apache/poi/util/LittleEndianBig5Stream.java b/src/java/org/apache/poi/util/LittleEndianBig5Stream.java new file mode 100644 index 0000000000..f68b1cdb9e --- /dev/null +++ b/src/java/org/apache/poi/util/LittleEndianBig5Stream.java @@ -0,0 +1,107 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.util; + +import java.io.ByteArrayInputStream; + +/** + * Stream that converts MSOffice's way of storing Big5, with + * zero-byte padding for ASCII and in LittleEndianOrder. + */ +@Internal +public class LittleEndianBig5Stream extends ByteArrayInputStream { + private static final int EOF = -1; + private static final int INVALID_PAIR = -2; + private static final int EMPTY_TRAILING = -3; + + //the char that is logically trailing in Big5 encoding + //however in LittleEndian order, this is the first encountered. 
+ int trailing = EMPTY_TRAILING; + public LittleEndianBig5Stream(byte[] buf) { + super(buf); + } + + public LittleEndianBig5Stream(byte[] buf, int offset, int length) { + super(buf, offset, length); + } + + @Override + public int read() { + + if (trailing != EMPTY_TRAILING) { + int tmp = trailing; + trailing = EMPTY_TRAILING; + return tmp; + } + int leading = readNext(); + while (leading == INVALID_PAIR) { + leading = readNext(); + } + + if (leading == EOF) { + return EOF; + } + return leading; + } + + //returns leading, sets trailing appropriately + //returns -1 if it hits the end of the stream + //returns -2 for an invalid big5 code pair + private final int readNext() { + trailing = super.read(); + if (trailing == -1) { + return EOF; + } + int leading = super.read(); + if (leading == EOF) { + return EOF; + } + int lead = leading&0xff; + if (lead > 0x80) { + return leading; + } else if (lead == 0) { + int ret = trailing; + trailing = EMPTY_TRAILING; + return ret; + } else { + int ret = trailing; + trailing = EMPTY_TRAILING; + return ret; + //return INVALID_PAIR; + } + + } + + @Override + public int read(byte[] buff, int off, int len) { + int bytesRead = 0; + for (int i = off; i < off+len; i++) { + int b = read(); + if (b == -1) { + if (bytesRead == 0) { + return -1; + } else { + return bytesRead; + } + } + bytesRead++; + buff[i] = (byte)b; + } + return bytesRead; + } +} diff --git a/src/java/org/apache/poi/util/StringUtil.java b/src/java/org/apache/poi/util/StringUtil.java index 20a6824c9b..5d09dff56d 100644 --- a/src/java/org/apache/poi/util/StringUtil.java +++ b/src/java/org/apache/poi/util/StringUtil.java @@ -17,6 +17,8 @@ package org.apache.poi.util; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Iterator; @@ -27,9 +29,14 @@ import java.util.Map; */ @Internal public class StringUtil { + + private static final POILogger logger = POILogFactory + 
.getLogger(StringUtil.class); protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1"); - protected static final Charset UTF16LE = Charset.forName("UTF-16LE"); + public static final Charset UTF16LE = Charset.forName("UTF-16LE"); public static final Charset UTF8 = Charset.forName("UTF-8"); + public static final Charset WIN_1252 = Charset.forName("cp1252"); + public static final Charset BIG5 = Charset.forName("Big5"); private static Map<Integer,Integer> msCodepointToUnicode; @@ -573,7 +580,28 @@ public class StringUtil { 9133, // 0xf0fe bracerightbt ' ', // 0xf0ff not defined }; - + + /** + * This tries to convert a LE byte array in Big5 to a String. + * We know MS zero-padded ascii, and we drop those. + * However, there may be areas for improvement in this. + * + * @param data + * @param offset + * @param lengthInBytes + * @return + */ + public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) { + ByteArrayOutputStream os = new ByteArrayOutputStream(); + try { + IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os); + } catch (IOException e) { + logger.log(POILogger.WARN, + "IOException while copying a byte array stream to a byte array stream?!"); + } + return new String(os.toByteArray(), BIG5); + } + // Could be replaced with org.apache.commons.lang3.StringUtils#join @Internal public static String join(Object[] array, String separator) { diff --git a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java index 703faa153a..f6e2563817 100644 --- a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java +++ b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java @@ -108,7 +108,7 @@ public class HwmfFont { return charset; } - static WmfCharset valueOf(int flag) { + public static WmfCharset valueOf(int flag) { for (WmfCharset cs : values()) { if (cs.flag == flag) return cs; } diff --git 
a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 6ff9f29bc4..505789e2c3 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -19,27 +19,43 @@ package org.apache.poi.hwpf; import java.io.File; import java.io.IOException; import java.io.OutputStream; +import java.nio.charset.Charset; +import org.apache.poi.hwmf.record.HwmfFont; import org.apache.poi.hwpf.model.ComplexFileTable; +import org.apache.poi.hwpf.model.FontTable; import org.apache.poi.hwpf.model.OldCHPBinTable; +import org.apache.poi.hwpf.model.OldComplexFileTable; +import org.apache.poi.hwpf.model.OldFfn; +import org.apache.poi.hwpf.model.OldFontTable; import org.apache.poi.hwpf.model.OldPAPBinTable; import org.apache.poi.hwpf.model.OldSectionTable; +import org.apache.poi.hwpf.model.OldTextPieceTable; import org.apache.poi.hwpf.model.PieceDescriptor; import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.model.TextPieceTable; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.NotImplemented; +import org.apache.poi.util.StringUtil; /** * Provides very simple support for old (Word 6 / Word 95) * files. 
*/ public class HWPFOldDocument extends HWPFDocumentCore { - private TextPieceTable tpt; + + private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252; + + private OldTextPieceTable tpt; private StringBuilder _text; + + private final OldFontTable fontTable; + private final Charset guessedCharset; public HWPFOldDocument(POIFSFileSystem fs) throws IOException { this(fs.getRoot()); @@ -56,45 +72,52 @@ public class HWPFOldDocument extends HWPFDocumentCore { int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc); int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0); int papTableSize = LittleEndian.getInt(_mainStream, 0xc4); - //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60); - //int shfTableSize = LittleEndian.getInt(_mainStream, 0x64); + int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0); + int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4); + + fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize); + //TODO: figure out how to map runs/text pieces to fonts + //for now, if there's a non standard codepage in one of the fonts + //assume that the doc is in that codepage. + guessedCharset = guessCodePage(fontTable); + int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160); // We need to get hold of the text that makes up the // document, which might be regular or fast-saved ComplexFileTable cft = null; - StringBuffer text = new StringBuffer(); if(_fib.getFibBase().isFComplex()) { - cft = new ComplexFileTable( + cft = new OldComplexFileTable( _mainStream, _mainStream, - complexTableOffset, _fib.getFibBase().getFcMin() + complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset ); - tpt = cft.getTextPieceTable(); + tpt = (OldTextPieceTable)cft.getTextPieceTable(); - for(TextPiece tp : tpt.getTextPieces()) { - text.append( tp.getStringBuilder() ); - } } else { // TODO Discover if these older documents can ever hold Unicode Strings? 
// (We think not, because they seem to lack a Piece table) // TODO Build the Piece Descriptor properly // (We have to fake it, as they don't seem to have a proper Piece table) - PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0); + PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset); pd.setFilePosition(_fib.getFibBase().getFcMin()); // Generate a single Text Piece Table, with a single Text Piece // which covers all the (8 bit only) text in the file - tpt = new TextPieceTable(); + tpt = new OldTextPieceTable(); byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()]; System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); + + int numChars = textData.length; + if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) { + numChars /= 2; + } + TextPiece tp = new TextPiece( - 0, textData.length, textData, pd + 0, numChars, textData, pd ); tpt.add(tp); - text.append(tp.getStringBuilder()); } - _text = tpt.getText(); // Now we can fetch the character and paragraph properties @@ -133,12 +156,54 @@ public class HWPFOldDocument extends HWPFDocumentCore { } } + + /** + * Take the first codepage that is not default, ansi or symbol. + * Ideally, we'd want to track fonts with runs, but we don't yet + * know how to do that. + * + * Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi + * appears here. 
+ * + * @param fontTable + * @return + */ + private Charset guessCodePage(OldFontTable fontTable) { + + for (OldFfn oldFfn : fontTable.getFontNames()) { + HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff); + if (wmfCharset != null && + wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET && + wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET && + wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) { + return wmfCharset.getCharset(); + } + } + return DEFAULT_CHARSET; + } + public Range getOverallRange() { // Life is easy when we have no footers, headers or unicode! return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this ); } + /** + * Use {@link #getOldFontTable()} instead!!! + * This always throws an UnsupportedOperationException. + * + * @return nothing + * @throws UnsupportedOperationException + */ + @Override + @NotImplemented + public FontTable getFontTable() { + throw new UnsupportedOperationException("Use getOldFontTable instead."); + } + + public OldFontTable getOldFontTable() { + return fontTable; + } public Range getRange() { return getOverallRange(); @@ -167,4 +232,19 @@ public class HWPFOldDocument extends HWPFDocumentCore { public void write(OutputStream out) throws IOException { throw new IllegalStateException("Writing is not available for the older file formats"); } + + /** + * As a rough heuristic (total hack), read through the font table + * and take the first non-default, non-ansi, non-symbol + * font's charset and return that. + * + * Once we figure out how to link a font to a text piece, we should + * use the font information per text piece. 
+ * + * @return charset + */ + public Charset getGuessedCharset() { + return guessedCharset; + } + } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java index 42a2fb9875..dc530bd647 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java @@ -18,6 +18,7 @@ package org.apache.poi.hwpf.model; import java.io.IOException; +import java.nio.charset.Charset; import java.util.LinkedList; import java.util.List; @@ -26,9 +27,10 @@ import org.apache.poi.hwpf.model.io.HWPFOutputStream; import org.apache.poi.hwpf.sprm.SprmBuffer; import org.apache.poi.util.Internal; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.StringUtil; @Internal -public final class ComplexFileTable { +public class ComplexFileTable { private static final byte GRPPRL_TYPE = 1; private static final byte TEXT_PIECE_TABLE_TYPE = 2; @@ -40,7 +42,8 @@ public final class ComplexFileTable { _tpt = new TextPieceTable(); } - public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException { + protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin, + Charset charset) throws IOException { //skips through the prms before we reach the piece table. 
These contain data //for actual fast saved files List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>(); @@ -61,7 +64,12 @@ public final class ComplexFileTable { } int pieceTableSize = LittleEndian.getInt(tableStream, ++offset); offset += LittleEndian.INT_SIZE; - _tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); + _tpt = newTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset); + + } + + public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException { + this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252); } public TextPieceTable getTextPieceTable() { @@ -92,4 +100,11 @@ public final class ComplexFileTable { tableStream.write(table); } + protected TextPieceTable newTextPieceTable(byte[] documentStream, + byte[] tableStream, int offset, int pieceTableSize, int fcMin, + Charset charset) { + return new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); + } + + } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java index bc3f4869bc..45061ad65b 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java @@ -44,7 +44,7 @@ public final class OldCHPBinTable extends CHPBinTable * @param fcMin */ public OldCHPBinTable(byte[] documentStream, int offset, - int size, int fcMin, TextPieceTable tpt) + int size, int fcMin, OldTextPieceTable tpt) { PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java new file mode 100644 index 0000000000..25510c89e9 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java @@ -0,0 +1,42 @@ +/* 
==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +import java.io.IOException; +import java.nio.charset.Charset; + +import org.apache.poi.util.Internal; + +@Internal +public final class OldComplexFileTable extends ComplexFileTable { + + public OldComplexFileTable(byte[] documentStream, byte[] tableStream, + int offset, int fcMin, Charset charset) throws IOException { + super(documentStream, tableStream, offset, fcMin, charset); + } + + + @Override + protected TextPieceTable newTextPieceTable(byte[] documentStream, + byte[] tableStream, int offset, + int pieceTableSize, int fcMin, Charset charset) { + return new OldTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset); + } + + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java new file mode 100644 index 0000000000..d50ac4ec0e --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java @@ -0,0 +1,161 @@ +/* ==================================================================== + Licensed to the Apache 
Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +import java.nio.charset.Charset; + +import org.apache.poi.hwmf.record.HwmfFont; +import org.apache.poi.util.Internal; +import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.apache.poi.util.StringUtil; + +/** + * Word 6.0 Font information + */ +@Internal +public final class OldFfn { + + private static final POILogger logger = POILogFactory.getLogger(OldFfn.class); + + private byte _chs;// character set identifier + + private final String fontName; + private final String altFontName; + + private final int length; //length in bytes for this record + + /** + * try to read an OldFfn starting at offset; read no farther than end + * + * @param buf buffer from which to read + * @param offset offset at which to start + * @param fontTableEnd read no farther than this + * @return an OldFfn or null if asked to read beyond end + */ + static OldFfn build(byte[] buf, int offset, int fontTableEnd) { + int start = offset; + //preliminary bytes + if (offset + 6 > fontTableEnd) { + return null; + } + //first byte + short fontDescriptionLength = (short) 
buf[offset]; + offset += 1; + if (offset + fontDescriptionLength > fontTableEnd) { + logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font"); + return null; + } + + //no idea what these 3 bytes do + offset += 3; + byte chs = buf[offset]; + Charset charset = null; + HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff); + if (wmfCharset == null) { + logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff)); + } else { + charset = wmfCharset.getCharset(); + } + charset = charset == null ? StringUtil.WIN_1252 : charset; + offset += LittleEndian.BYTE_SIZE; + //if this byte here == 7, it _may_ signify existence of + //an alternate font name + + //not sure what the byte after the _chs does + offset += LittleEndian.BYTE_SIZE; + int fontNameLength = -1; + for (int i = offset; i < fontTableEnd; i++) { + if (buf[i] == 0) { + fontNameLength = i - offset; + break; + } + } + if (fontNameLength == -1) { + logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length"); + return null; + } + String fontName = new String(buf, offset, fontNameLength, charset); + String altFontName = null; + int altFontNameLength = -1; + offset += fontNameLength + 1; + if (offset - start < fontDescriptionLength) { + for (int i = offset; i <= start + fontDescriptionLength; i++) { + if (buf[i] == 0) { + altFontNameLength = i - offset; + break; + } + } + if (altFontNameLength > -1) { + altFontName = new String(buf, offset, altFontNameLength, charset); + } + } + //reset to 0 for length calculation + altFontNameLength = (altFontNameLength < 0) ? 
0 : altFontNameLength + 1;//add one for zero byte + + int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE +//6 starting bytes + fontNameLength + altFontNameLength + 1;//+1 is for the zero byte + //this len should == fontDescriptionLength + + return new OldFfn(chs, fontName, altFontName, len); + + } + + public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) { + this._chs = charsetIdentifier; + this.fontName = fontName; + this.altFontName = altFontName; + this.length = length; + } + + public byte getChs() { + return _chs; + } + + public String getMainFontName() { + return fontName; + } + + /** + * @return altFontName if it exists, null otherwise + */ + public String getAltFontName() { + return altFontName; + } + + + /** + * @return length in bytes for this record + */ + public int getLength() { + return length; + } + + @Override + public String toString() { + return "OldFfn{" + + "_chs=" + (_chs & 0xff) + + ", fontName='" + fontName + '\'' + + ", altFontName='" + altFontName + '\'' + + ", length=" + length + + '}'; + } +} + + diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java new file mode 100644 index 0000000000..dfe1f95e0e --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java @@ -0,0 +1,84 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.poi.util.Internal; +import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +/** + * Font table for Word 6.0 + */ +@Internal +public final class OldFontTable { + private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class); + + // added extra facilitator members + // FFN structure containing strings of font names + private final OldFfn[] _fontNames; + + public OldFontTable(byte[] buf, int offset, int length) { + //length is stored at the index section in the table + //and it is recorded in the first short. + + + List<OldFfn> ffns = new ArrayList<OldFfn>(); + int fontTableLength = LittleEndian.getShort(buf, offset); + + int endOfTableOffset = offset + length; + int startOffset = offset + LittleEndian.SHORT_SIZE;//first short should == length! 
+ + while (true) { + OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset); + if (oldFfn == null) { + break; + } + ffns.add(oldFfn); + startOffset += oldFfn.getLength(); + + } + _fontNames = ffns.toArray(new OldFfn[ffns.size()]); + } + + + public OldFfn[] getFontNames() { + return _fontNames; + } + + + public String getMainFont(int chpFtc) { + if (chpFtc >= _fontNames.length) { + _logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount"); + return null; + } + + return _fontNames[chpFtc].getMainFontName(); + } + + @Override + public String toString() { + return "OldFontTable{" + + "_fontNames=" + Arrays.toString(_fontNames) + + '}'; + } +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java new file mode 100644 index 0000000000..c82635bc31 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java @@ -0,0 +1,120 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ + +package org.apache.poi.hwpf.model; + + +import org.apache.poi.util.Internal; +import org.apache.poi.util.NotImplemented; + +/** + * Lightweight representation of a text piece. + * Works in the character domain, not the byte domain, so you + * need to have turned byte references into character + * references before getting here. + */ +@Internal +public class OldTextPiece extends TextPiece { + + private final byte[] rawBytes; + + /** + * @param start Beginning offset in main document stream, in characters. + * @param end Ending offset in main document stream, in characters. + * @param text The raw bytes of our text + */ + public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) { + super(start, end, text, pd); + this.rawBytes = text; + if (end < start) { + throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end); + } + } + + /** + * @return nothing, ever. Always throws an UnsupportedOperationException + * @throws UnsupportedOperationException + */ + @NotImplemented + @Override + public boolean isUnicode() { + throw new UnsupportedOperationException(); + } + + + public StringBuilder getStringBuilder() { + return (StringBuilder) _buf; + } + + @Override + public byte[] getRawBytes() { + byte[] buf = new byte[rawBytes.length]; + System.arraycopy(rawBytes, 0, buf, 0, rawBytes.length); + return buf; + } + + /** + * Returns part of the string. + * Works only in characters, not in bytes! + * + * @param start Local start position, in characters + * @param end Local end position, in characters + * @throws UnsupportedOperationException + */ + @Deprecated + @NotImplemented + public String substring(int start, int end) { + throw new UnsupportedOperationException(); + } + + /** + * Not implemented for OldTextPiece. 
+ * Always throws UnsupportedOperationException + */ + @Deprecated + @NotImplemented + public void adjustForDelete(int start, int length) { + throw new UnsupportedOperationException(); + } + + /** + * Returns the length, in bytes + */ + public int bytesLength() { + return rawBytes.length; + } + + @Override + public int hashCode() { + assert false : "hashCode not designed"; + return 42; // any arbitrary constant will do + } + + + /** + * Returns the character position we start at. + */ + public int getCP() { + return getStart(); + } + + public String toString() { + return "OldTextPiece from " + getStart() + " to " + getEnd() + " (" + + getPieceDescriptor() + ")"; + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java new file mode 100644 index 0000000000..3fd34ade09 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java @@ -0,0 +1,119 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.model; + +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; + +import org.apache.poi.util.CodePageUtil; +import org.apache.poi.util.Internal; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + + +@Internal +public class OldTextPieceTable extends TextPieceTable { + + private static final POILogger logger = POILogFactory + .getLogger(OldTextPieceTable.class); + + public OldTextPieceTable() { + super(); + } + + public OldTextPieceTable(byte[] documentStream, byte[] tableStream, + int offset, int size, int fcMin, Charset charset) { + //super(documentStream, tableStream, offset, size, fcMin, charset); + // get our plex of PieceDescriptors + PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, + PieceDescriptor.getSizeInBytes()); + + int length = pieceTable.length(); + PieceDescriptor[] pieces = new PieceDescriptor[length]; + + // iterate through piece descriptors raw bytes and create + // PieceDescriptor objects + for (int x = 0; x < length; x++) { + GenericPropertyNode node = pieceTable.getProperty(x); + pieces[x] = new PieceDescriptor(node.getBytes(), 0, charset); + } + + // Figure out the cp of the earliest text piece + // Note that text pieces don't have to be stored in order! + _cpMin = pieces[0].getFilePosition() - fcMin; + for (PieceDescriptor piece : pieces) { + int start = piece.getFilePosition() - fcMin; + if (start < _cpMin) { + _cpMin = start; + } + } + + // using the PieceDescriptors, build our list of TextPieces. + for (int x = 0; x < pieces.length; x++) { + int start = pieces[x].getFilePosition(); + GenericPropertyNode node = pieceTable.getProperty(x); + + // Grab the start and end, which are in characters + int nodeStartChars = node.getStart(); + int nodeEndChars = node.getEnd(); + + // What's the relationship between bytes and characters? 
+ boolean unicode = pieces[x].isUnicode(); + int multiple = 1; + if (unicode || + (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) { + multiple = 2; + } + + // Figure out the length, in bytes and chars + int textSizeChars = (nodeEndChars - nodeStartChars); + int textSizeBytes = textSizeChars * multiple; + + // Grab the data that makes up the piece + byte[] buf = new byte[textSizeBytes]; + System.arraycopy(documentStream, start, buf, 0, textSizeBytes); + + // And now build the piece + final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf, + pieces[x]); + + _textPieces.add(newTextPiece); + } + + // In the interest of our sanity, now sort the text pieces + // into order, if they're not already + Collections.sort(_textPieces); + _textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces); + Collections.sort(_textPiecesFCOrder, new FCComparator()); + + } + + @Override + protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) { + return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd); + } + + @Override + protected int getEncodingMultiplier(TextPiece textPiece) { + Charset charset = textPiece.getPieceDescriptor().getCharset(); + if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) { + return 2; + } + return 1; + } +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java index 34c29511c3..3979009f20 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java @@ -260,7 +260,7 @@ public class PAPBinTable SprmBuffer sprmBuffer = null; for ( PAPX papx : papxs ) { - if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 ) + if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 ) continue; if ( sprmBuffer == null ) { diff --git 
a/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java index a190f1db03..53dcc17457 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java @@ -17,10 +17,13 @@ package org.apache.poi.hwpf.model; +import java.nio.charset.Charset; + import org.apache.poi.util.BitField; import org.apache.poi.util.BitFieldFactory; import org.apache.poi.util.Internal; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.StringUtil; @Internal public final class PieceDescriptor @@ -32,29 +35,51 @@ public final class PieceDescriptor private static BitField fCopied = BitFieldFactory.getInstance(0x04); int fc; PropertyModifier prm; - boolean unicode; - + boolean unicode = false; + private final Charset charset; - public PieceDescriptor(byte[] buf, int offset) - { - descriptor = LittleEndian.getShort(buf, offset); - offset += LittleEndian.SHORT_SIZE; - fc = LittleEndian.getInt(buf, offset); - offset += LittleEndian.INT_SIZE; - prm = new PropertyModifier( LittleEndian.getShort(buf, offset)); - // see if this piece uses unicode. - if ((fc & 0x40000000) == 0) - { - unicode = true; - } - else - { - unicode = false; - fc &= ~(0x40000000);//gives me FC in doc stream - fc /= 2; + public PieceDescriptor(byte[] buf, int offset) { + this(buf, offset, null); } + /** + * + * This initializer should only be used for HWPFOldDocuments. + * + * @param buf + * @param offset + * @param charset which charset to use if this is not unicode + */ + public PieceDescriptor(byte[] buf, int offset, Charset charset) { + descriptor = LittleEndian.getShort(buf, offset); + offset += LittleEndian.SHORT_SIZE; + fc = LittleEndian.getInt(buf, offset); + offset += LittleEndian.INT_SIZE; + prm = new PropertyModifier(LittleEndian.getShort(buf, offset)); + if (charset == null) { + // see if this piece uses unicode. 
+ //From the documentation: If the second most significant bit + //is clear, then this indicates the actual file offset of the Unicode character (two bytes). If the + //second most significant bit is set, then the actual address of the codepage-1252 + //compressed version of the Unicode character (one byte), is actually at the offset indicated + //by clearing this bit and dividing by two. + if ((fc & 0x40000000) == 0) { + unicode = true; + this.charset = null; + } else { + unicode = false; + fc &= ~(0x40000000);//gives me FC in doc stream + fc /= 2; + this.charset = StringUtil.WIN_1252; + } + } else { + if (charset == StringUtil.UTF16LE) { + unicode = true; + } + this.charset = charset; + } + } public int getFilePosition() @@ -72,6 +97,15 @@ public final class PieceDescriptor return unicode; } + /** + * + * @return charset to use if this is not a Unicode PieceDescriptor + * this can be <code>null</code> + */ + public Charset getCharset() { + return charset; + } + public PropertyModifier getPrm() { return prm; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index d432f35b6f..2a63bda16b 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -21,6 +21,7 @@ package org.apache.poi.hwpf.model; import java.nio.charset.Charset; import org.apache.poi.util.Internal; +import org.apache.poi.util.StringUtil; /** * Lightweight representation of a text piece. @@ -40,7 +41,6 @@ public class TextPiece extends PropertyNode<TextPiece> { * @param start Beginning offset in main document stream, in characters. * @param end Ending offset in main document stream, in characters. 
* @param text The raw bytes of our text - * @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)} * instead */ public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, @@ -72,8 +72,13 @@ public class TextPiece extends PropertyNode<TextPiece> { * Create the StringBuilder from the text and unicode flag */ private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { - String str = new String(text, Charset.forName(pd.isUnicode() ? "UTF-16LE" : "Cp1252")); + byte[] textBuffer = text; + if (StringUtil.BIG5.equals(pd.getCharset())) { + String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString(); + return new StringBuilder(txt); + } + String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset()); return new StringBuilder(str); } @@ -207,4 +212,5 @@ public class TextPiece extends PropertyNode<TextPiece> { return "TextPiece from " + getStart() + " to " + getEnd() + " (" + getPieceDescriptor() + ")"; } + } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java index 0108877c7b..bbddd86459 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java @@ -101,7 +101,7 @@ public class TextPieceTable implements CharIndexTranslator { System.arraycopy(documentStream, start, buf, 0, textSizeBytes); // And now build the piece - final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf, + final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf, pieces[x]); _textPieces.add(newTextPiece); @@ -114,6 +114,10 @@ public class TextPieceTable implements CharIndexTranslator { Collections.sort(_textPiecesFCOrder, new FCComparator()); } + protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) { + 
return new TextPiece(nodeStartChars, nodeEndChars, buf, pd); + } + public void add(TextPiece piece) { _textPieces.add(piece); _textPiecesFCOrder.add(piece); @@ -249,7 +253,7 @@ public class TextPieceTable implements CharIndexTranslator { if (rangeStartBytes > rangeEndBytes) continue; - final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1; + final int encodingMultiplier = getEncodingMultiplier(textPiece); final int rangeStartCp = textPiece.getStart() + (rangeStartBytes - tpStart) / encodingMultiplier; @@ -262,6 +266,10 @@ public class TextPieceTable implements CharIndexTranslator { return result.toArray(new int[result.size()][]); } + protected int getEncodingMultiplier(TextPiece textPiece) { + return textPiece.isUnicode() ? 2 : 1; + } + public int getCpMin() { return _cpMin; } @@ -439,7 +447,7 @@ public class TextPieceTable implements CharIndexTranslator { return textPlex.toByteArray(); } - private static class FCComparator implements Comparator<TextPiece>, Serializable { + protected static class FCComparator implements Comparator<TextPiece>, Serializable { public int compare(TextPiece textPiece, TextPiece textPiece1) { if (textPiece.getPieceDescriptor().fc > textPiece1 .getPieceDescriptor().fc) { diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java index 730133319c..5c2dc47490 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java @@ -18,6 +18,7 @@ package org.apache.poi.hwpf.usermodel; import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFOldDocument; import org.apache.poi.hwpf.model.CHPX; import org.apache.poi.hwpf.model.FFData; import org.apache.poi.hwpf.model.Ffn; @@ -438,6 +439,10 @@ public final class CharacterRun extends Range public String getFontName() { + if (_doc instanceof HWPFOldDocument) { + return ((HWPFOldDocument) 
_doc).getOldFontTable().getMainFont(_props.getFtcAscii()); + } + if (_doc.getFontTable() == null) // old word format return null; diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java index f3194bf2f8..5a3bc6e385 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java @@ -16,18 +16,19 @@ ==================================================================== */ package org.apache.poi.hwpf.converter; -import java.io.File; -import java.io.FilenameFilter; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import static org.junit.Assert.assertNotNull; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +import java.io.File; +import java.io.FilenameFilter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import org.apache.poi.POIDataSamples; import org.apache.poi.hwpf.HWPFDocumentCore; @@ -36,8 +37,6 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.junit.Assert.assertNotNull; - @RunWith(Parameterized.class) public class TestWordToConverterSuite { @@ -45,7 +44,11 @@ public class TestWordToConverterSuite * YK: a quick hack to exclude failing documents from the suite. 
*/ private static List<String> failingFiles = Arrays - .asList( "ProblemExtracting.doc" ); + .asList( "ProblemExtracting.doc", + "Bug50955.doc" //basic extraction works, + // but these extractors modify the document, + // which is a no-go for this Word 6.0 file + ); @Parameterized.Parameters(name="{index}: {0}") public static Iterable<Object[]> files() { diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java index b1e02f35c9..1ff7abd259 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java @@ -57,6 +57,7 @@ import junit.framework.TestCase; * against HWPF */ public class TestBugs{ + private static final POILogger logger = POILogFactory.getLogger(TestBugs.class); public static void assertEqualsIgnoreNewline(String expected, String actual ) @@ -536,13 +537,6 @@ public class TestBugs{ hwpfDocument.getPicturesTable().getAllPictures(); } - /** - * [FAILING] Bug 50955 - error while retrieving the text file - */ - @Test(expected=IllegalStateException.class) - public void test50955() throws IOException { - getTextOldFile("Bug50955.doc"); - } /** * [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java index 47017dbf7e..bfe22605ad 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java @@ -17,14 +17,19 @@ package org.apache.poi.hwpf.usermodel; +import static org.apache.poi.POITestCase.assertContains; import static org.junit.Assert.assertEquals; import java.io.IOException; +import java.nio.charset.Charset; import org.apache.poi.OldFileFormatException; +import 
org.apache.poi.hwmf.record.HwmfFont; import org.apache.poi.hwpf.HWPFOldDocument; import org.apache.poi.hwpf.HWPFTestCase; import org.apache.poi.hwpf.HWPFTestDataSamples; +import org.apache.poi.hwpf.extractor.Word6Extractor; +import org.apache.poi.hwpf.model.OldFontTable; import org.junit.Test; /** @@ -98,7 +103,7 @@ public final class TestHWPFOldDocument extends HWPFTestCase { assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns()); // Normal, superscript for 4th, normal assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns()); - + doc.close(); } @@ -143,4 +148,87 @@ public final class TestHWPFOldDocument extends HWPFTestCase { doc.getRange().getParagraph(1).text()); doc.close(); } + + @Test + public void testDefaultCodePageEncoding() throws IOException { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc"); + Word6Extractor ex = new Word6Extractor(doc); + String txt = ex.getText(); + assertContains(txt, "BERTHOD"); + assertContains(txt, "APPLICOLOR"); + assertContains(txt, "les meilleurs"); + assertContains(txt, "GUY LECOLE"); + } + + + @Test + public void testCodePageBug50955() throws IOException { + //windows 1251 + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc"); + Word6Extractor ex = new Word6Extractor(doc); + + StringBuilder sb = new StringBuilder(); + for (String p : ex.getParagraphText()) { + sb.append(p); + } + assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings! 
+ } + + @Test + public void testCodePageBug60936() throws IOException { + //windows 1250 -- this test file was generated with OpenOffice + //see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration + + + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc"); + Word6Extractor ex = new Word6Extractor(doc); + StringBuilder sb = new StringBuilder(); + for (String p : ex.getParagraphText()) { + sb.append(p); + } + assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings! + } + + @Test + public void testOldFontTableEncoding() throws IOException { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); + OldFontTable oldFontTable = doc.getOldFontTable(); + assertEquals(5, oldFontTable.getFontNames().length); + assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName()); + assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5")); + assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName()); + doc.close(); + + } + + @Test + public void testOldFontTableAltName() throws IOException { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc"); + OldFontTable oldFontTable = doc.getOldFontTable(); + assertEquals(5, oldFontTable.getFontNames().length); + assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName()); + assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName()); + assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName()); + assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName()); + } + + + @Test + public void test51944() throws IOException { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); + Word6Extractor ex = new Word6Extractor(doc); + StringBuilder sb = new StringBuilder(); + for (String p : ex.getParagraphText()) { + sb.append(p.replaceAll("[\r\n]+", "\n")); + } + String txt = sb.toString(); + 
assertContains(txt, "Post and Fax"); + assertContains(txt, "also maintain");//this is at a critical juncture + assertContains(txt, "which are available for");//this too + + //TODO: figure out why these two aren't passing +// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly +// assertContains(txt, "We are able to");//not sure if we can get this easily? + } + } diff --git a/test-data/document/Bug60936.doc b/test-data/document/Bug60936.doc Binary files differnew file mode 100644 index 0000000000..e7e397d568 --- /dev/null +++ b/test-data/document/Bug60936.doc diff --git a/test-data/document/Bug60942.doc b/test-data/document/Bug60942.doc Binary files differnew file mode 100644 index 0000000000..fe64e67bc1 --- /dev/null +++ b/test-data/document/Bug60942.doc diff --git a/test-data/document/Bug60942b.doc b/test-data/document/Bug60942b.doc Binary files differnew file mode 100644 index 0000000000..7ca3b9839d --- /dev/null +++ b/test-data/document/Bug60942b.doc |