git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790061 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_16_FINAL
@@ -218,6 +218,9 @@ public class TestAllFiles { | |||
"document/Word6_sections2.doc", | |||
"document/Word95.doc", | |||
"document/word95err.doc", | |||
"document/Bug60936.doc", | |||
"document/Bug60942.doc", | |||
"document/Bug60942b.doc", | |||
"hpsf/TestMickey.doc", | |||
"document/52117.doc" | |||
); |
@@ -18,6 +18,9 @@ | |||
package org.apache.poi.util; | |||
import java.io.UnsupportedEncodingException; | |||
import java.nio.charset.Charset; | |||
import java.util.HashSet; | |||
import java.util.Set; | |||
/** | |||
* Utilities for working with Microsoft CodePages. | |||
@@ -27,6 +30,13 @@ import java.io.UnsupportedEncodingException; | |||
*/ | |||
public class CodePageUtil | |||
{ | |||
/**
 * Charsets for which a single character may be stored in more than one byte.
 * The set is published as an unmodifiable view so that callers cannot alter
 * this shared, static state.
 */
public static final Set<Charset> VARIABLE_BYTE_CHARSETS;
static {
    Set<Charset> charsets = new HashSet<Charset>();
    //others?
    charsets.add(StringUtil.BIG5);
    VARIABLE_BYTE_CHARSETS = java.util.Collections.unmodifiableSet(charsets);
}
/** <p>Codepage 037, a special case</p> */ | |||
public static final int CP_037 = 37; | |||
@@ -0,0 +1,107 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.util; | |||
import java.io.ByteArrayInputStream; | |||
/** | |||
* Stream that converts MSOffice's way of storing Big5, with | |||
* zero-byte padding for ASCII and in LittleEndianOrder. | |||
*/ | |||
@Internal | |||
public class LittleEndianBig5Stream extends ByteArrayInputStream {
    /** Value returned by the read methods once the underlying buffer is exhausted. */
    private static final int EOF = -1;
    /** Sentinel stored in {@link #trailing} when no byte is being held back. */
    private static final int EMPTY_TRAILING = -3;

    //the char that is logically trailing in Big5 encoding
    //however in LittleEndian order, this is the first encountered.
    int trailing = EMPTY_TRAILING;

    /**
     * @param buf the little-endian bytes to convert
     */
    public LittleEndianBig5Stream(byte[] buf) {
        super(buf);
    }

    /**
     * @param buf    the little-endian bytes to convert
     * @param offset index of the first byte to read
     * @param length maximum number of bytes to read from {@code buf}
     */
    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
        super(buf, offset, length);
    }

    /**
     * Returns the next converted byte. The underlying bytes are consumed two
     * at a time; when the second byte of a pair is above 0x80 it is returned
     * first and the first byte is held back for the following call, otherwise
     * only the first byte of the pair is returned.
     *
     * @return the next byte (0-255), or -1 at the end of the stream
     */
    @Override
    public int read() {
        if (trailing != EMPTY_TRAILING) {
            int pending = trailing;
            trailing = EMPTY_TRAILING;
            return pending;
        }
        return readNext();
    }

    /**
     * Consumes up to two bytes from the underlying buffer.
     *
     * @return the byte to hand out now, or -1 if the buffer is exhausted;
     *         {@link #trailing} is set only when a second byte must follow
     */
    private int readNext() {
        int first = super.read();
        if (first == EOF) {
            return EOF;
        }
        int second = super.read();
        if (second == EOF) {
            // Odd number of bytes: hand out the dangling byte now. Holding it
            // back would make it surface *after* EOF had already been reported.
            return first;
        }
        if (second > 0x80) {
            // two-byte code: emit the lead byte now, the trail byte on the next call
            trailing = first;
            return second;
        }
        // zero padding (or a second byte that cannot be a lead byte):
        // only the first byte of the pair is kept
        return first;
    }

    /**
     * Fills {@code buff} with converted bytes by repeatedly calling {@link #read()}.
     *
     * @param buff destination buffer
     * @param off  index in {@code buff} of the first byte to write
     * @param len  maximum number of bytes to write
     * @return the number of bytes written, or -1 if nothing could be read
     *         because the end of the stream was reached
     * @throws NullPointerException      if {@code buff} is null
     * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not fit {@code buff}
     */
    @Override
    public int read(byte[] buff, int off, int len) {
        if (buff == null) {
            throw new NullPointerException("buff");
        }
        if (off < 0 || len < 0 || len > buff.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + " len=" + len + " buff.length=" + buff.length);
        }
        int bytesRead = 0;
        while (bytesRead < len) {
            int b = read();
            if (b == EOF) {
                return bytesRead == 0 ? EOF : bytesRead;
            }
            buff[off + bytesRead] = (byte) b;
            bytesRead++;
        }
        return bytesRead;
    }
}
@@ -17,6 +17,8 @@ | |||
package org.apache.poi.util; | |||
import java.io.ByteArrayOutputStream; | |||
import java.io.IOException; | |||
import java.nio.charset.Charset; | |||
import java.util.HashMap; | |||
import java.util.Iterator; | |||
@@ -27,9 +29,14 @@ import java.util.Map; | |||
*/ | |||
@Internal | |||
public class StringUtil { | |||
private static final POILogger logger = POILogFactory | |||
.getLogger(StringUtil.class); | |||
protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1"); | |||
protected static final Charset UTF16LE = Charset.forName("UTF-16LE"); | |||
public static final Charset UTF16LE = Charset.forName("UTF-16LE"); | |||
public static final Charset UTF8 = Charset.forName("UTF-8"); | |||
public static final Charset WIN_1252 = Charset.forName("cp1252"); | |||
public static final Charset BIG5 = Charset.forName("Big5"); | |||
private static Map<Integer,Integer> msCodepointToUnicode; | |||
@@ -573,7 +580,28 @@ public class StringUtil { | |||
9133, // 0xf0fe bracerightbt | |||
' ', // 0xf0ff not defined | |||
}; | |||
/** | |||
* This tries to convert a LE byte array in Big5 to a String. | |||
* We know MS zero-padded ascii, and we drop those. | |||
* However, there may be areas for improvement in this. | |||
* | |||
* @param data | |||
* @param offset | |||
* @param lengthInBytes | |||
* @return | |||
*/ | |||
public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) { | |||
ByteArrayOutputStream os = new ByteArrayOutputStream(); | |||
try { | |||
IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os); | |||
} catch (IOException e) { | |||
logger.log(POILogger.WARN, | |||
"IOException while copying a byte array stream to a byte array stream?!"); | |||
} | |||
return new String(os.toByteArray(), BIG5); | |||
} | |||
// Could be replaced with org.apache.commons.lang3.StringUtils#join | |||
@Internal | |||
public static String join(Object[] array, String separator) { |
@@ -108,7 +108,7 @@ public class HwmfFont { | |||
return charset; | |||
} | |||
static WmfCharset valueOf(int flag) { | |||
public static WmfCharset valueOf(int flag) { | |||
for (WmfCharset cs : values()) { | |||
if (cs.flag == flag) return cs; | |||
} |
@@ -19,27 +19,43 @@ package org.apache.poi.hwpf; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.OutputStream; | |||
import java.nio.charset.Charset; | |||
import org.apache.poi.hwmf.record.HwmfFont; | |||
import org.apache.poi.hwpf.model.ComplexFileTable; | |||
import org.apache.poi.hwpf.model.FontTable; | |||
import org.apache.poi.hwpf.model.OldCHPBinTable; | |||
import org.apache.poi.hwpf.model.OldComplexFileTable; | |||
import org.apache.poi.hwpf.model.OldFfn; | |||
import org.apache.poi.hwpf.model.OldFontTable; | |||
import org.apache.poi.hwpf.model.OldPAPBinTable; | |||
import org.apache.poi.hwpf.model.OldSectionTable; | |||
import org.apache.poi.hwpf.model.OldTextPieceTable; | |||
import org.apache.poi.hwpf.model.PieceDescriptor; | |||
import org.apache.poi.hwpf.model.TextPiece; | |||
import org.apache.poi.hwpf.model.TextPieceTable; | |||
import org.apache.poi.hwpf.usermodel.Range; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.util.CodePageUtil; | |||
import org.apache.poi.util.LittleEndian; | |||
import org.apache.poi.util.NotImplemented; | |||
import org.apache.poi.util.StringUtil; | |||
/** | |||
* Provides very simple support for old (Word 6 / Word 95) | |||
* files. | |||
*/ | |||
public class HWPFOldDocument extends HWPFDocumentCore { | |||
private TextPieceTable tpt; | |||
private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252; | |||
private OldTextPieceTable tpt; | |||
private StringBuilder _text; | |||
private final OldFontTable fontTable; | |||
private final Charset guessedCharset; | |||
public HWPFOldDocument(POIFSFileSystem fs) throws IOException { | |||
this(fs.getRoot()); | |||
@@ -56,45 +72,52 @@ public class HWPFOldDocument extends HWPFDocumentCore { | |||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc); | |||
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0); | |||
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4); | |||
//int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60); | |||
//int shfTableSize = LittleEndian.getInt(_mainStream, 0x64); | |||
int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0); | |||
int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4); | |||
fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize); | |||
//TODO: figure out how to map runs/text pieces to fonts | |||
//for now, if there's a non standard codepage in one of the fonts | |||
//assume that the doc is in that codepage. | |||
guessedCharset = guessCodePage(fontTable); | |||
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160); | |||
// We need to get hold of the text that makes up the | |||
// document, which might be regular or fast-saved | |||
ComplexFileTable cft = null; | |||
StringBuffer text = new StringBuffer(); | |||
if(_fib.getFibBase().isFComplex()) { | |||
cft = new ComplexFileTable( | |||
cft = new OldComplexFileTable( | |||
_mainStream, _mainStream, | |||
complexTableOffset, _fib.getFibBase().getFcMin() | |||
complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset | |||
); | |||
tpt = cft.getTextPieceTable(); | |||
tpt = (OldTextPieceTable)cft.getTextPieceTable(); | |||
for(TextPiece tp : tpt.getTextPieces()) { | |||
text.append( tp.getStringBuilder() ); | |||
} | |||
} else { | |||
// TODO Discover if these older documents can ever hold Unicode Strings? | |||
// (We think not, because they seem to lack a Piece table) | |||
// TODO Build the Piece Descriptor properly | |||
// (We have to fake it, as they don't seem to have a proper Piece table) | |||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0); | |||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset); | |||
pd.setFilePosition(_fib.getFibBase().getFcMin()); | |||
// Generate a single Text Piece Table, with a single Text Piece | |||
// which covers all the (8 bit only) text in the file | |||
tpt = new TextPieceTable(); | |||
tpt = new OldTextPieceTable(); | |||
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()]; | |||
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); | |||
int numChars = textData.length; | |||
if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) { | |||
numChars /= 2; | |||
} | |||
TextPiece tp = new TextPiece( | |||
0, textData.length, textData, pd | |||
0, numChars, textData, pd | |||
); | |||
tpt.add(tp); | |||
text.append(tp.getStringBuilder()); | |||
} | |||
_text = tpt.getText(); | |||
// Now we can fetch the character and paragraph properties | |||
@@ -133,12 +156,54 @@ public class HWPFOldDocument extends HWPFDocumentCore { | |||
} | |||
} | |||
/** | |||
* Take the first codepage that is not default, ansi or symbol. | |||
* Ideally, we'd want to track fonts with runs, but we don't yet | |||
* know how to do that. | |||
* | |||
* Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi | |||
* appears here. | |||
* | |||
* @param fontTable | |||
* @return | |||
*/ | |||
private Charset guessCodePage(OldFontTable fontTable) { | |||
for (OldFfn oldFfn : fontTable.getFontNames()) { | |||
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff); | |||
if (wmfCharset != null && | |||
wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET && | |||
wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET && | |||
wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) { | |||
return wmfCharset.getCharset(); | |||
} | |||
} | |||
return DEFAULT_CHARSET; | |||
} | |||
public Range getOverallRange() | |||
{ | |||
// Life is easy when we have no footers, headers or unicode! | |||
return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this ); | |||
} | |||
/** | |||
* Use {@link #getOldFontTable()} instead!!! | |||
* This always throws an IllegalArgumentException. | |||
* | |||
* @return nothing | |||
* @throws UnsupportedOperationException | |||
*/ | |||
@Override | |||
@NotImplemented | |||
public FontTable getFontTable() { | |||
throw new UnsupportedOperationException("Use getOldFontTable instead."); | |||
} | |||
public OldFontTable getOldFontTable() { | |||
return fontTable; | |||
} | |||
public Range getRange() | |||
{ | |||
return getOverallRange(); | |||
@@ -167,4 +232,19 @@ public class HWPFOldDocument extends HWPFDocumentCore { | |||
public void write(OutputStream out) throws IOException { | |||
throw new IllegalStateException("Writing is not available for the older file formats"); | |||
} | |||
/** | |||
* As a rough heuristic (total hack), read through the font table | |||
* and take the first non-default, non-ansi, non-symbol | |||
* font's charset and return that. | |||
* | |||
* Once we figure out how to link a font to a text piece, we should | |||
* use the font information per text piece. | |||
* | |||
* @return charset | |||
*/ | |||
public Charset getGuessedCharset() { | |||
return guessedCharset; | |||
} | |||
} |
@@ -18,6 +18,7 @@ | |||
package org.apache.poi.hwpf.model; | |||
import java.io.IOException; | |||
import java.nio.charset.Charset; | |||
import java.util.LinkedList; | |||
import java.util.List; | |||
@@ -26,9 +27,10 @@ import org.apache.poi.hwpf.model.io.HWPFOutputStream; | |||
import org.apache.poi.hwpf.sprm.SprmBuffer; | |||
import org.apache.poi.util.Internal; | |||
import org.apache.poi.util.LittleEndian; | |||
import org.apache.poi.util.StringUtil; | |||
@Internal | |||
public final class ComplexFileTable { | |||
public class ComplexFileTable { | |||
private static final byte GRPPRL_TYPE = 1; | |||
private static final byte TEXT_PIECE_TABLE_TYPE = 2; | |||
@@ -40,7 +42,8 @@ public final class ComplexFileTable { | |||
_tpt = new TextPieceTable(); | |||
} | |||
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException { | |||
protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin, | |||
Charset charset) throws IOException { | |||
//skips through the prms before we reach the piece table. These contain data | |||
//for actual fast saved files | |||
List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>(); | |||
@@ -61,7 +64,12 @@ public final class ComplexFileTable { | |||
} | |||
int pieceTableSize = LittleEndian.getInt(tableStream, ++offset); | |||
offset += LittleEndian.INT_SIZE; | |||
_tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); | |||
_tpt = newTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset); | |||
} | |||
/**
 * Delegates to the charset-aware constructor, passing
 * {@link StringUtil#WIN_1252} as the charset.
 */
public ComplexFileTable(byte[] documentStream, byte[] tableStream,
        int offset, int fcMin) throws IOException {
    this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252);
}
public TextPieceTable getTextPieceTable() { | |||
@@ -92,4 +100,11 @@ public final class ComplexFileTable { | |||
tableStream.write(table); | |||
} | |||
protected TextPieceTable newTextPieceTable(byte[] documentStream, | |||
byte[] tableStream, int offset, int pieceTableSize, int fcMin, | |||
Charset charset) { | |||
return new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); | |||
} | |||
} |
@@ -44,7 +44,7 @@ public final class OldCHPBinTable extends CHPBinTable | |||
* @param fcMin | |||
*/ | |||
public OldCHPBinTable(byte[] documentStream, int offset, | |||
int size, int fcMin, TextPieceTable tpt) | |||
int size, int fcMin, OldTextPieceTable tpt) | |||
{ | |||
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); | |||
@@ -0,0 +1,42 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf.model; | |||
import java.io.IOException; | |||
import java.nio.charset.Charset; | |||
import org.apache.poi.util.Internal; | |||
@Internal | |||
public final class OldComplexFileTable extends ComplexFileTable { | |||
public OldComplexFileTable(byte[] documentStream, byte[] tableStream, | |||
int offset, int fcMin, Charset charset) throws IOException { | |||
super(documentStream, tableStream, offset, fcMin, charset); | |||
} | |||
@Override | |||
protected TextPieceTable newTextPieceTable(byte[] documentStream, | |||
byte[] tableStream, int offset, | |||
int pieceTableSize, int fcMin, Charset charset) { | |||
return new OldTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset); | |||
} | |||
} |
@@ -0,0 +1,161 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf.model; | |||
import java.nio.charset.Charset; | |||
import org.apache.poi.hwmf.record.HwmfFont; | |||
import org.apache.poi.util.Internal; | |||
import org.apache.poi.util.LittleEndian; | |||
import org.apache.poi.util.POILogFactory; | |||
import org.apache.poi.util.POILogger; | |||
import org.apache.poi.util.StringUtil; | |||
/** | |||
* Word 6.0 Font information | |||
*/ | |||
@Internal | |||
public final class OldFfn { | |||
private static final POILogger logger = POILogFactory.getLogger(OldFfn.class); | |||
private byte _chs;// character set identifier | |||
private final String fontName; | |||
private final String altFontName; | |||
private final int length; //length in bytes for this record | |||
/** | |||
* try to read an OldFfn starting at offset; read no farther than end | |||
* | |||
* @param buf buffer from which to read | |||
* @param offset offset at which to start | |||
* @param fontTableEnd read no farther than this | |||
* @return an OldFfn or null if asked to read beyond end | |||
*/ | |||
static OldFfn build(byte[] buf, int offset, int fontTableEnd) { | |||
int start = offset; | |||
//preliminary bytes | |||
if (offset + 6 > fontTableEnd) { | |||
return null; | |||
} | |||
//first byte | |||
short fontDescriptionLength = (short) buf[offset]; | |||
offset += 1; | |||
if (offset + fontDescriptionLength > fontTableEnd) { | |||
logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font"); | |||
return null; | |||
} | |||
//no idea what these 3 bytes do | |||
offset += 3; | |||
byte chs = buf[offset]; | |||
Charset charset = null; | |||
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff); | |||
if (wmfCharset == null) { | |||
logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff)); | |||
} else { | |||
charset = wmfCharset.getCharset(); | |||
} | |||
charset = charset == null ? StringUtil.WIN_1252 : charset; | |||
offset += LittleEndian.BYTE_SIZE; | |||
//if this byte here == 7, it _may_ signify existence of | |||
//an altername font name | |||
//not sure what the byte after the _chs does | |||
offset += LittleEndian.BYTE_SIZE; | |||
int fontNameLength = -1; | |||
for (int i = offset; i < fontTableEnd; i++) { | |||
if (buf[i] == 0) { | |||
fontNameLength = i - offset; | |||
break; | |||
} | |||
} | |||
if (fontNameLength == -1) { | |||
logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length"); | |||
return null; | |||
} | |||
String fontName = new String(buf, offset, fontNameLength, charset); | |||
String altFontName = null; | |||
int altFontNameLength = -1; | |||
offset += fontNameLength + 1; | |||
if (offset - start < fontDescriptionLength) { | |||
for (int i = offset; i <= start + fontDescriptionLength; i++) { | |||
if (buf[i] == 0) { | |||
altFontNameLength = i - offset; | |||
break; | |||
} | |||
} | |||
if (altFontNameLength > -1) { | |||
altFontName = new String(buf, offset, altFontNameLength, charset); | |||
} | |||
} | |||
//reset to 0 for length calculation | |||
altFontNameLength = (altFontNameLength < 0) ? 0 : altFontNameLength + 1;//add one for zero byte | |||
int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE +//6 starting bytes | |||
fontNameLength + altFontNameLength + 1;//+1 is for the zero byte | |||
//this len should == fontDescriptionLength | |||
return new OldFfn(chs, fontName, altFontName, len); | |||
} | |||
public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) { | |||
this._chs = charsetIdentifier; | |||
this.fontName = fontName; | |||
this.altFontName = altFontName; | |||
this.length = length; | |||
} | |||
public byte getChs() { | |||
return _chs; | |||
} | |||
public String getMainFontName() { | |||
return fontName; | |||
} | |||
/** | |||
* @return altFontName if it exists, null otherwise | |||
*/ | |||
public String getAltFontName() { | |||
return altFontName; | |||
} | |||
/** | |||
* @return length in bytes for this record | |||
*/ | |||
public int getLength() { | |||
return length; | |||
} | |||
@Override | |||
public String toString() { | |||
return "OldFfn{" + | |||
"_chs=" + (_chs & 0xff) + | |||
", fontName='" + fontName + '\'' + | |||
", altFontName='" + altFontName + '\'' + | |||
", length=" + length + | |||
'}'; | |||
} | |||
} | |||
@@ -0,0 +1,84 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf.model; | |||
import java.util.ArrayList; | |||
import java.util.Arrays; | |||
import java.util.List; | |||
import org.apache.poi.util.Internal; | |||
import org.apache.poi.util.LittleEndian; | |||
import org.apache.poi.util.POILogFactory; | |||
import org.apache.poi.util.POILogger; | |||
/** | |||
* Font table for Word 6.0 | |||
*/ | |||
@Internal | |||
public final class OldFontTable { | |||
private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class); | |||
// added extra facilitator members | |||
// FFN structure containing strings of font names | |||
private final OldFfn[] _fontNames; | |||
public OldFontTable(byte[] buf, int offset, int length) { | |||
//length is stored at the index section in the table | |||
//and it is recorded in the first short. | |||
List<OldFfn> ffns = new ArrayList<OldFfn>(); | |||
int fontTableLength = LittleEndian.getShort(buf, offset); | |||
int endOfTableOffset = offset + length; | |||
int startOffset = offset + LittleEndian.SHORT_SIZE;//first short should == length! | |||
while (true) { | |||
OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset); | |||
if (oldFfn == null) { | |||
break; | |||
} | |||
ffns.add(oldFfn); | |||
startOffset += oldFfn.getLength(); | |||
} | |||
_fontNames = ffns.toArray(new OldFfn[ffns.size()]); | |||
} | |||
public OldFfn[] getFontNames() { | |||
return _fontNames; | |||
} | |||
public String getMainFont(int chpFtc) { | |||
if (chpFtc >= _fontNames.length) { | |||
_logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount"); | |||
return null; | |||
} | |||
return _fontNames[chpFtc].getMainFontName(); | |||
} | |||
@Override | |||
public String toString() { | |||
return "OldFontTable{" + | |||
"_fontNames=" + Arrays.toString(_fontNames) + | |||
'}'; | |||
} | |||
} |
@@ -0,0 +1,120 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf.model; | |||
import org.apache.poi.util.Internal; | |||
import org.apache.poi.util.NotImplemented; | |||
/** | |||
* Lightweight representation of a text piece. | |||
* Works in the character domain, not the byte domain, so you | |||
* need to have turned byte references into character | |||
* references before getting here. | |||
*/ | |||
@Internal | |||
public class OldTextPiece extends TextPiece { | |||
private final byte[] rawBytes; | |||
/** | |||
* @param start Beginning offset in main document stream, in characters. | |||
* @param end Ending offset in main document stream, in characters. | |||
* @param text The raw bytes of our text | |||
*/ | |||
public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) { | |||
super(start, end, text, pd); | |||
this.rawBytes = text; | |||
if (end < start) { | |||
throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end); | |||
} | |||
} | |||
/** | |||
* @return nothing, ever. Always throws an UnsupportedOperationException | |||
* @throws UnsupportedOperationException | |||
*/ | |||
@NotImplemented | |||
@Override | |||
public boolean isUnicode() { | |||
throw new UnsupportedOperationException(); | |||
} | |||
public StringBuilder getStringBuilder() { | |||
return (StringBuilder) _buf; | |||
} | |||
@Override | |||
public byte[] getRawBytes() { | |||
byte[] buf = new byte[rawBytes.length]; | |||
System.arraycopy(rawBytes, 0, buf, 0, rawBytes.length); | |||
return buf; | |||
} | |||
/** | |||
* Returns part of the string. | |||
* Works only in characters, not in bytes! | |||
* | |||
* @param start Local start position, in characters | |||
* @param end Local end position, in characters | |||
* @throws UnsupportedOperationException | |||
*/ | |||
@Deprecated | |||
@NotImplemented | |||
public String substring(int start, int end) { | |||
throw new UnsupportedOperationException(); | |||
} | |||
/** | |||
* Not implemented for OldTextPiece. | |||
* Always throws UnsupportedOperationException | |||
*/ | |||
@Deprecated | |||
@NotImplemented | |||
public void adjustForDelete(int start, int length) { | |||
throw new UnsupportedOperationException(); | |||
} | |||
/** | |||
* Returns the length, in bytes | |||
*/ | |||
public int bytesLength() { | |||
return rawBytes.length; | |||
} | |||
@Override | |||
public int hashCode() { | |||
assert false : "hashCode not designed"; | |||
return 42; // any arbitrary constant will do | |||
} | |||
/** | |||
* Returns the character position we start at. | |||
*/ | |||
public int getCP() { | |||
return getStart(); | |||
} | |||
public String toString() { | |||
return "OldTextPiece from " + getStart() + " to " + getEnd() + " (" | |||
+ getPieceDescriptor() + ")"; | |||
} | |||
} |
@@ -0,0 +1,119 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf.model; | |||
import java.nio.charset.Charset; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import org.apache.poi.util.CodePageUtil; | |||
import org.apache.poi.util.Internal; | |||
import org.apache.poi.util.POILogFactory; | |||
import org.apache.poi.util.POILogger; | |||
@Internal | |||
public class OldTextPieceTable extends TextPieceTable { | |||
private static final POILogger logger = POILogFactory | |||
.getLogger(OldTextPieceTable.class); | |||
/**
 * Creates a table using the superclass' no-argument constructor;
 * no piece data is read here.
 */
public OldTextPieceTable() {
    super();
}
/**
 * Reads the piece descriptors found in {@code tableStream} and builds one
 * text piece per descriptor from the bytes of {@code documentStream}.
 *
 * @param documentStream stream holding the text bytes
 * @param tableStream    stream holding the plex of piece descriptors
 * @param offset         offset of the plex in {@code tableStream}
 * @param size           size of the plex in bytes
 * @param fcMin          subtracted from each file position to find the
 *                       earliest character position
 * @param charset        charset handed to every piece descriptor; pieces are
 *                       read with two bytes per character if it is one of
 *                       {@link CodePageUtil#VARIABLE_BYTE_CHARSETS}
 */
public OldTextPieceTable(byte[] documentStream, byte[] tableStream,
        int offset, int size, int fcMin, Charset charset) {
    // get our plex of PieceDescriptors
    final PlexOfCps plex = new PlexOfCps(tableStream, offset, size,
            PieceDescriptor.getSizeInBytes());
    final int count = plex.length();

    // turn the raw bytes of each plex entry into a PieceDescriptor
    final PieceDescriptor[] descriptors = new PieceDescriptor[count];
    for (int i = 0; i < count; i++) {
        descriptors[i] = new PieceDescriptor(plex.getProperty(i).getBytes(), 0, charset);
    }

    // Figure out the cp of the earliest text piece
    // Note that text pieces don't have to be stored in order!
    _cpMin = descriptors[0].getFilePosition() - fcMin;
    for (PieceDescriptor descriptor : descriptors) {
        _cpMin = Math.min(_cpMin, descriptor.getFilePosition() - fcMin);
    }

    // the same for every piece: does the charset need two bytes per character?
    final boolean variableByteCharset =
            charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset);

    // using the PieceDescriptors, build our list of TextPieces.
    for (int i = 0; i < count; i++) {
        final PieceDescriptor descriptor = descriptors[i];
        final int filePosition = descriptor.getFilePosition();
        final GenericPropertyNode node = plex.getProperty(i);

        // start and end are in characters
        final int cpStart = node.getStart();
        final int cpEnd = node.getEnd();

        // What's the relationship between bytes and characters?
        final int bytesPerChar = (descriptor.isUnicode() || variableByteCharset) ? 2 : 1;
        final int byteCount = (cpEnd - cpStart) * bytesPerChar;

        // Grab the data that makes up the piece
        final byte[] pieceBytes = new byte[byteCount];
        System.arraycopy(documentStream, filePosition, pieceBytes, 0, byteCount);

        // And now build the piece
        _textPieces.add(newTextPiece(cpStart, cpEnd, pieceBytes, descriptor));
    }

    // In the interest of our sanity, now sort the text pieces
    // into order, if they're not already
    Collections.sort(_textPieces);
    _textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces);
    Collections.sort(_textPiecesFCOrder, new FCComparator());
}
@Override | |||
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) { | |||
return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd); | |||
} | |||
@Override | |||
protected int getEncodingMultiplier(TextPiece textPiece) { | |||
Charset charset = textPiece.getPieceDescriptor().getCharset(); | |||
if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) { | |||
return 2; | |||
} | |||
return 1; | |||
} | |||
} |
@@ -260,7 +260,7 @@ public class PAPBinTable | |||
SprmBuffer sprmBuffer = null; | |||
for ( PAPX papx : papxs ) | |||
{ | |||
if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 ) | |||
if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 ) | |||
continue; | |||
if ( sprmBuffer == null ) { |
@@ -17,10 +17,13 @@ | |||
package org.apache.poi.hwpf.model; | |||
import java.nio.charset.Charset; | |||
import org.apache.poi.util.BitField; | |||
import org.apache.poi.util.BitFieldFactory; | |||
import org.apache.poi.util.Internal; | |||
import org.apache.poi.util.LittleEndian; | |||
import org.apache.poi.util.StringUtil; | |||
@Internal | |||
public final class PieceDescriptor | |||
@@ -32,29 +35,51 @@ public final class PieceDescriptor | |||
private static BitField fCopied = BitFieldFactory.getInstance(0x04); | |||
int fc; | |||
PropertyModifier prm; | |||
boolean unicode; | |||
boolean unicode = false; | |||
private final Charset charset; | |||
public PieceDescriptor(byte[] buf, int offset) | |||
{ | |||
descriptor = LittleEndian.getShort(buf, offset); | |||
offset += LittleEndian.SHORT_SIZE; | |||
fc = LittleEndian.getInt(buf, offset); | |||
offset += LittleEndian.INT_SIZE; | |||
prm = new PropertyModifier( LittleEndian.getShort(buf, offset)); | |||
// see if this piece uses unicode. | |||
if ((fc & 0x40000000) == 0) | |||
{ | |||
unicode = true; | |||
} | |||
else | |||
{ | |||
unicode = false; | |||
fc &= ~(0x40000000);//gives me FC in doc stream | |||
fc /= 2; | |||
/**
 * Creates a descriptor whose encoding is derived from the flag bit stored
 * in the file offset; equivalent to passing a <code>null</code> charset to
 * {@link #PieceDescriptor(byte[], int, Charset)}.
 *
 * @param buf    bytes containing the serialized descriptor
 * @param offset index in <code>buf</code> of the descriptor's first byte
 */
public PieceDescriptor(byte[] buf, int offset) {
    this(buf, offset, null);
}

/**
 * Creates a descriptor, optionally forcing the charset of its text.
 * Passing a non-null charset should only be done for HWPFOldDocuments.
 *
 * @param buf     bytes containing the serialized descriptor
 * @param offset  index in <code>buf</code> of the descriptor's first byte
 * @param charset which charset to use if this is not unicode; when
 *                <code>null</code> the encoding is read from the file offset's flag bit
 */
public PieceDescriptor(byte[] buf, int offset, Charset charset) {
    descriptor = LittleEndian.getShort(buf, offset);
    offset += LittleEndian.SHORT_SIZE;
    fc = LittleEndian.getInt(buf, offset);
    offset += LittleEndian.INT_SIZE;
    prm = new PropertyModifier(LittleEndian.getShort(buf, offset));
    if (charset == null) {
        // see if this piece uses unicode.
        // From the documentation: If the second most significant bit
        // is clear, then this indicates the actual file offset of the Unicode
        // character (two bytes). If the second most significant bit is set,
        // then the actual address of the codepage-1252 compressed version of
        // the Unicode character (one byte), is actually at the offset
        // indicated by clearing this bit and dividing by two.
        if ((fc & 0x40000000) == 0) {
            unicode = true;
            this.charset = null;
        } else {
            unicode = false;
            fc &= ~(0x40000000);//gives me FC in doc stream
            fc /= 2;
            this.charset = StringUtil.WIN_1252;
        }
    } else {
        // Compare by value: a Charset obtained elsewhere is not guaranteed
        // to be the very same instance as the StringUtil constant.
        unicode = StringUtil.UTF16LE.equals(charset);
        this.charset = charset;
    }
}
public int getFilePosition() | |||
@@ -72,6 +97,15 @@ public final class PieceDescriptor | |||
return unicode; | |||
} | |||
/** | |||
* | |||
* @return charset to use if this is not a Unicode PieceDescriptor | |||
* this can be <code>null</code> | |||
*/ | |||
public Charset getCharset() { | |||
return charset; | |||
} | |||
public PropertyModifier getPrm() | |||
{ | |||
return prm; |
@@ -21,6 +21,7 @@ package org.apache.poi.hwpf.model; | |||
import java.nio.charset.Charset; | |||
import org.apache.poi.util.Internal; | |||
import org.apache.poi.util.StringUtil; | |||
/** | |||
* Lightweight representation of a text piece. | |||
@@ -40,7 +41,6 @@ public class TextPiece extends PropertyNode<TextPiece> { | |||
* @param start Beginning offset in main document stream, in characters. | |||
* @param end Ending offset in main document stream, in characters. | |||
* @param text The raw bytes of our text | |||
* @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)} | |||
* instead | |||
*/ | |||
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, | |||
@@ -72,8 +72,13 @@ public class TextPiece extends PropertyNode<TextPiece> { | |||
* Create the StringBuilder from the text and unicode flag | |||
*/ | |||
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { | |||
String str = new String(text, Charset.forName(pd.isUnicode() ? "UTF-16LE" : "Cp1252")); | |||
byte[] textBuffer = text; | |||
if (StringUtil.BIG5.equals(pd.getCharset())) { | |||
String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString(); | |||
return new StringBuilder(txt); | |||
} | |||
String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset()); | |||
return new StringBuilder(str); | |||
} | |||
@@ -207,4 +212,5 @@ public class TextPiece extends PropertyNode<TextPiece> { | |||
return "TextPiece from " + getStart() + " to " + getEnd() + " (" | |||
+ getPieceDescriptor() + ")"; | |||
} | |||
} |
@@ -101,7 +101,7 @@ public class TextPieceTable implements CharIndexTranslator { | |||
System.arraycopy(documentStream, start, buf, 0, textSizeBytes); | |||
// And now build the piece | |||
final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf, | |||
final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf, | |||
pieces[x]); | |||
_textPieces.add(newTextPiece); | |||
@@ -114,6 +114,10 @@ public class TextPieceTable implements CharIndexTranslator { | |||
Collections.sort(_textPiecesFCOrder, new FCComparator()); | |||
} | |||
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) { | |||
return new TextPiece(nodeStartChars, nodeEndChars, buf, pd); | |||
} | |||
public void add(TextPiece piece) { | |||
_textPieces.add(piece); | |||
_textPiecesFCOrder.add(piece); | |||
@@ -249,7 +253,7 @@ public class TextPieceTable implements CharIndexTranslator { | |||
if (rangeStartBytes > rangeEndBytes) | |||
continue; | |||
final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1; | |||
final int encodingMultiplier = getEncodingMultiplier(textPiece); | |||
final int rangeStartCp = textPiece.getStart() | |||
+ (rangeStartBytes - tpStart) / encodingMultiplier; | |||
@@ -262,6 +266,10 @@ public class TextPieceTable implements CharIndexTranslator { | |||
return result.toArray(new int[result.size()][]); | |||
} | |||
protected int getEncodingMultiplier(TextPiece textPiece) { | |||
return textPiece.isUnicode() ? 2 : 1; | |||
} | |||
public int getCpMin() { | |||
return _cpMin; | |||
} | |||
@@ -439,7 +447,7 @@ public class TextPieceTable implements CharIndexTranslator { | |||
return textPlex.toByteArray(); | |||
} | |||
private static class FCComparator implements Comparator<TextPiece>, Serializable { | |||
protected static class FCComparator implements Comparator<TextPiece>, Serializable { | |||
public int compare(TextPiece textPiece, TextPiece textPiece1) { | |||
if (textPiece.getPieceDescriptor().fc > textPiece1 | |||
.getPieceDescriptor().fc) { |
@@ -18,6 +18,7 @@ | |||
package org.apache.poi.hwpf.usermodel; | |||
import org.apache.poi.hwpf.HWPFDocument; | |||
import org.apache.poi.hwpf.HWPFOldDocument; | |||
import org.apache.poi.hwpf.model.CHPX; | |||
import org.apache.poi.hwpf.model.FFData; | |||
import org.apache.poi.hwpf.model.Ffn; | |||
@@ -438,6 +439,10 @@ public final class CharacterRun extends Range | |||
public String getFontName() | |||
{ | |||
if (_doc instanceof HWPFOldDocument) { | |||
return ((HWPFOldDocument) _doc).getOldFontTable().getMainFont(_props.getFtcAscii()); | |||
} | |||
if (_doc.getFontTable() == null) | |||
// old word format | |||
return null; |
@@ -16,18 +16,19 @@ | |||
==================================================================== */ | |||
package org.apache.poi.hwpf.converter; | |||
import java.io.File; | |||
import java.io.FilenameFilter; | |||
import java.io.StringWriter; | |||
import java.util.ArrayList; | |||
import java.util.Arrays; | |||
import java.util.List; | |||
import static org.junit.Assert.assertNotNull; | |||
import javax.xml.transform.OutputKeys; | |||
import javax.xml.transform.Transformer; | |||
import javax.xml.transform.TransformerFactory; | |||
import javax.xml.transform.dom.DOMSource; | |||
import javax.xml.transform.stream.StreamResult; | |||
import java.io.File; | |||
import java.io.FilenameFilter; | |||
import java.io.StringWriter; | |||
import java.util.ArrayList; | |||
import java.util.Arrays; | |||
import java.util.List; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.hwpf.HWPFDocumentCore; | |||
@@ -36,8 +37,6 @@ import org.junit.Test; | |||
import org.junit.runner.RunWith; | |||
import org.junit.runners.Parameterized; | |||
import static org.junit.Assert.assertNotNull; | |||
@RunWith(Parameterized.class) | |||
public class TestWordToConverterSuite | |||
{ | |||
@@ -45,7 +44,11 @@ public class TestWordToConverterSuite | |||
* YK: a quick hack to exclude failing documents from the suite. | |||
*/ | |||
private static List<String> failingFiles = Arrays | |||
.asList( "ProblemExtracting.doc" ); | |||
.asList( "ProblemExtracting.doc", | |||
"Bug50955.doc" //basic extraction works, | |||
// but these extractors modify the document, | |||
// which is a no-go for this Word 6.0 file | |||
); | |||
@Parameterized.Parameters(name="{index}: {0}") | |||
public static Iterable<Object[]> files() { |
@@ -57,6 +57,7 @@ import junit.framework.TestCase; | |||
* against HWPF | |||
*/ | |||
public class TestBugs{ | |||
private static final POILogger logger = POILogFactory.getLogger(TestBugs.class); | |||
public static void assertEqualsIgnoreNewline(String expected, String actual ) | |||
@@ -536,13 +537,6 @@ public class TestBugs{ | |||
hwpfDocument.getPicturesTable().getAllPictures(); | |||
} | |||
/** | |||
* [FAILING] Bug 50955 - error while retrieving the text file | |||
*/ | |||
@Test(expected=IllegalStateException.class) | |||
public void test50955() throws IOException { | |||
getTextOldFile("Bug50955.doc"); | |||
} | |||
/** | |||
* [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta |
@@ -17,14 +17,19 @@ | |||
package org.apache.poi.hwpf.usermodel; | |||
import static org.apache.poi.POITestCase.assertContains; | |||
import static org.junit.Assert.assertEquals; | |||
import java.io.IOException; | |||
import java.nio.charset.Charset; | |||
import org.apache.poi.OldFileFormatException; | |||
import org.apache.poi.hwmf.record.HwmfFont; | |||
import org.apache.poi.hwpf.HWPFOldDocument; | |||
import org.apache.poi.hwpf.HWPFTestCase; | |||
import org.apache.poi.hwpf.HWPFTestDataSamples; | |||
import org.apache.poi.hwpf.extractor.Word6Extractor; | |||
import org.apache.poi.hwpf.model.OldFontTable; | |||
import org.junit.Test; | |||
/** | |||
@@ -98,7 +103,7 @@ public final class TestHWPFOldDocument extends HWPFTestCase { | |||
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns()); | |||
// Normal, superscript for 4th, normal | |||
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns()); | |||
doc.close(); | |||
} | |||
@@ -143,4 +148,87 @@ public final class TestHWPFOldDocument extends HWPFTestCase { | |||
doc.getRange().getParagraph(1).text()); | |||
doc.close(); | |||
} | |||
@Test | |||
public void testDefaultCodePageEncoding() throws IOException { | |||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc"); | |||
Word6Extractor ex = new Word6Extractor(doc); | |||
String txt = ex.getText(); | |||
assertContains(txt, "BERTHOD"); | |||
assertContains(txt, "APPLICOLOR"); | |||
assertContains(txt, "les meilleurs"); | |||
assertContains(txt, "GUY LECOLE"); | |||
} | |||
@Test | |||
public void testCodePageBug50955() throws IOException { | |||
//windows 1251 | |||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc"); | |||
Word6Extractor ex = new Word6Extractor(doc); | |||
StringBuilder sb = new StringBuilder(); | |||
for (String p : ex.getParagraphText()) { | |||
sb.append(p); | |||
} | |||
assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings! | |||
} | |||
@Test | |||
public void testCodePageBug60936() throws IOException { | |||
//windows 1250 -- this test file was generated with OpenOffice | |||
//see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration | |||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc"); | |||
Word6Extractor ex = new Word6Extractor(doc); | |||
StringBuilder sb = new StringBuilder(); | |||
for (String p : ex.getParagraphText()) { | |||
sb.append(p); | |||
} | |||
assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings! | |||
} | |||
@Test | |||
public void testOldFontTableEncoding() throws IOException { | |||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); | |||
OldFontTable oldFontTable = doc.getOldFontTable(); | |||
assertEquals(5, oldFontTable.getFontNames().length); | |||
assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName()); | |||
assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5")); | |||
assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName()); | |||
doc.close(); | |||
} | |||
@Test | |||
public void testOldFontTableAltName() throws IOException { | |||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc"); | |||
OldFontTable oldFontTable = doc.getOldFontTable(); | |||
assertEquals(5, oldFontTable.getFontNames().length); | |||
assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName()); | |||
assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName()); | |||
assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName()); | |||
assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName()); | |||
} | |||
@Test | |||
public void test51944() throws IOException { | |||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); | |||
Word6Extractor ex = new Word6Extractor(doc); | |||
StringBuilder sb = new StringBuilder(); | |||
for (String p : ex.getParagraphText()) { | |||
sb.append(p.replaceAll("[\r\n]+", "\n")); | |||
} | |||
String txt = sb.toString(); | |||
assertContains(txt, "Post and Fax"); | |||
assertContains(txt, "also maintain");//this is at a critical juncture | |||
assertContains(txt, "which are available for");//this too | |||
//TODO: figure out why these two aren't passing | |||
// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly | |||
// assertContains(txt, "We are able to");//not sure if we can get this easily? | |||
} | |||
} |