git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790061 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_16_FINAL
"document/Word6_sections2.doc", | "document/Word6_sections2.doc", | ||||
"document/Word95.doc", | "document/Word95.doc", | ||||
"document/word95err.doc", | "document/word95err.doc", | ||||
"document/Bug60936.doc", | |||||
"document/Bug60942.doc", | |||||
"document/Bug60942b.doc", | |||||
"hpsf/TestMickey.doc", | "hpsf/TestMickey.doc", | ||||
"document/52117.doc" | "document/52117.doc" | ||||
); | ); |
package org.apache.poi.util; | package org.apache.poi.util; | ||||
import java.io.UnsupportedEncodingException; | import java.io.UnsupportedEncodingException; | ||||
import java.nio.charset.Charset; | |||||
import java.util.HashSet; | |||||
import java.util.Set; | |||||
/** | /** | ||||
* Utilities for working with Microsoft CodePages. | * Utilities for working with Microsoft CodePages. | ||||
*/ | */ | ||||
public class CodePageUtil | public class CodePageUtil | ||||
{ | { | ||||
public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>(); | |||||
static { | |||||
//others? | |||||
VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5); | |||||
} | |||||
/** <p>Codepage 037, a special case</p> */ | /** <p>Codepage 037, a special case</p> */ | ||||
public static final int CP_037 = 37; | public static final int CP_037 = 37; | ||||
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.util; | |||||
import java.io.ByteArrayInputStream; | |||||
/** | |||||
* Stream that converts MSOffice's way of storing Big5, with | |||||
* zero-byte padding for ASCII and in LittleEndianOrder. | |||||
*/ | |||||
@Internal | |||||
public class LittleEndianBig5Stream extends ByteArrayInputStream {

    private static final int EOF = -1;

    /** Sentinel meaning "no byte is waiting in {@link #trailing}". */
    private static final int EMPTY_TRAILING = -3;

    //the char that is logically trailing in Big5 encoding
    //however in LittleEndian order, this is the first encountered.
    //Holds EMPTY_TRAILING whenever no byte is pending.
    int trailing = EMPTY_TRAILING;

    /**
     * Wraps the whole of {@code buf}.
     *
     * @param buf little-endian, zero-padded Big5 bytes as stored by MS Office
     */
    public LittleEndianBig5Stream(byte[] buf) {
        super(buf);
    }

    /**
     * Wraps {@code length} bytes of {@code buf}, starting at {@code offset}.
     *
     * @param buf    little-endian, zero-padded Big5 bytes as stored by MS Office
     * @param offset index of the first byte to read
     * @param length maximum number of bytes to read from {@code buf}
     */
    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
        super(buf, offset, length);
    }

    /**
     * Returns the next byte of the re-ordered stream: for a two byte Big5
     * code the lead byte comes first and the trail byte follows on the next
     * call; for zero-padded ASCII only the ASCII byte is returned.
     *
     * @return the next byte (0-255), or -1 once the underlying buffer is
     *         exhausted; once -1 has been returned no further data follows
     */
    @Override
    public int read() {
        if (trailing != EMPTY_TRAILING) {
            // second half of a pair that was split on the previous call
            int pending = trailing;
            trailing = EMPTY_TRAILING;
            return pending;
        }
        return readNext();
    }

    /**
     * Consumes one two-byte unit from the underlying buffer.
     *
     * @return the byte to emit now, or -1 at the end of the buffer. When the
     *         unit is a Big5 pair, {@link #trailing} is set to the byte that
     *         has to be emitted next.
     */
    private int readNext() {
        int low = super.read();
        if (low == EOF) {
            return EOF;
        }
        int high = super.read();
        if (high == EOF) {
            // Odd number of bytes: emit the unpaired byte now rather than
            // parking it in 'trailing', where it used to surface after EOF
            // had already been reported (or was lost altogether).
            return low;
        }
        if (high > 0x80) {
            // Big5 lead byte: emit it first, the low byte follows
            trailing = low;
            return high;
        }
        // high == 0 is zero-padded ASCII. Any other value is not a valid
        // Big5 lead byte; as before, the high byte is dropped and only the
        // low byte is kept.
        return low;
    }

    /**
     * Fills {@code buff} with up to {@code len} re-ordered bytes.
     *
     * @param buff destination buffer
     * @param off  index in {@code buff} of the first byte to write
     * @param len  maximum number of bytes to write
     * @return the number of bytes written, 0 if {@code len} is 0, or -1 if
     *         the stream was already exhausted
     * @throws NullPointerException      if {@code buff} is null
     * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not
     *                                   describe a range inside {@code buff}
     */
    @Override
    public int read(byte[] buff, int off, int len) {
        if (buff == null) {
            throw new NullPointerException("buff");
        }
        if (off < 0 || len < 0 || len > buff.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", buffer length=" + buff.length);
        }
        int bytesRead = 0;
        while (bytesRead < len) {
            int b = read();
            if (b == EOF) {
                break;
            }
            buff[off + bytesRead] = (byte) b;
            bytesRead++;
        }
        // only report EOF when something was asked for and nothing was left
        return (bytesRead == 0 && len > 0) ? EOF : bytesRead;
    }
}
package org.apache.poi.util; | package org.apache.poi.util; | ||||
import java.io.ByteArrayOutputStream; | |||||
import java.io.IOException; | |||||
import java.nio.charset.Charset; | import java.nio.charset.Charset; | ||||
import java.util.HashMap; | import java.util.HashMap; | ||||
import java.util.Iterator; | import java.util.Iterator; | ||||
*/ | */ | ||||
@Internal | @Internal | ||||
public class StringUtil { | public class StringUtil { | ||||
private static final POILogger logger = POILogFactory | |||||
.getLogger(StringUtil.class); | |||||
protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1"); | protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1"); | ||||
protected static final Charset UTF16LE = Charset.forName("UTF-16LE"); | |||||
public static final Charset UTF16LE = Charset.forName("UTF-16LE"); | |||||
public static final Charset UTF8 = Charset.forName("UTF-8"); | public static final Charset UTF8 = Charset.forName("UTF-8"); | ||||
public static final Charset WIN_1252 = Charset.forName("cp1252"); | |||||
public static final Charset BIG5 = Charset.forName("Big5"); | |||||
private static Map<Integer,Integer> msCodepointToUnicode; | private static Map<Integer,Integer> msCodepointToUnicode; | ||||
9133, // 0xf0fe bracerightbt | 9133, // 0xf0fe bracerightbt | ||||
' ', // 0xf0ff not defined | ' ', // 0xf0ff not defined | ||||
}; | }; | ||||
/** | |||||
* This tries to convert a LE byte array in Big5 to a String. | |||||
* We know MS zero-padded ascii, and we drop those. | |||||
* However, there may be areas for improvement in this. | |||||
* | |||||
* @param data | |||||
* @param offset | |||||
* @param lengthInBytes | |||||
* @return | |||||
*/ | |||||
public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) { | |||||
ByteArrayOutputStream os = new ByteArrayOutputStream(); | |||||
try { | |||||
IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os); | |||||
} catch (IOException e) { | |||||
logger.log(POILogger.WARN, | |||||
"IOException while copying a byte array stream to a byte array stream?!"); | |||||
} | |||||
return new String(os.toByteArray(), BIG5); | |||||
} | |||||
// Could be replaced with org.apache.commons.lang3.StringUtils#join | // Could be replaced with org.apache.commons.lang3.StringUtils#join | ||||
@Internal | @Internal | ||||
public static String join(Object[] array, String separator) { | public static String join(Object[] array, String separator) { |
return charset; | return charset; | ||||
} | } | ||||
static WmfCharset valueOf(int flag) { | |||||
public static WmfCharset valueOf(int flag) { | |||||
for (WmfCharset cs : values()) { | for (WmfCharset cs : values()) { | ||||
if (cs.flag == flag) return cs; | if (cs.flag == flag) return cs; | ||||
} | } |
import java.io.File; | import java.io.File; | ||||
import java.io.IOException; | import java.io.IOException; | ||||
import java.io.OutputStream; | import java.io.OutputStream; | ||||
import java.nio.charset.Charset; | |||||
import org.apache.poi.hwmf.record.HwmfFont; | |||||
import org.apache.poi.hwpf.model.ComplexFileTable; | import org.apache.poi.hwpf.model.ComplexFileTable; | ||||
import org.apache.poi.hwpf.model.FontTable; | |||||
import org.apache.poi.hwpf.model.OldCHPBinTable; | import org.apache.poi.hwpf.model.OldCHPBinTable; | ||||
import org.apache.poi.hwpf.model.OldComplexFileTable; | |||||
import org.apache.poi.hwpf.model.OldFfn; | |||||
import org.apache.poi.hwpf.model.OldFontTable; | |||||
import org.apache.poi.hwpf.model.OldPAPBinTable; | import org.apache.poi.hwpf.model.OldPAPBinTable; | ||||
import org.apache.poi.hwpf.model.OldSectionTable; | import org.apache.poi.hwpf.model.OldSectionTable; | ||||
import org.apache.poi.hwpf.model.OldTextPieceTable; | |||||
import org.apache.poi.hwpf.model.PieceDescriptor; | import org.apache.poi.hwpf.model.PieceDescriptor; | ||||
import org.apache.poi.hwpf.model.TextPiece; | import org.apache.poi.hwpf.model.TextPiece; | ||||
import org.apache.poi.hwpf.model.TextPieceTable; | import org.apache.poi.hwpf.model.TextPieceTable; | ||||
import org.apache.poi.hwpf.usermodel.Range; | import org.apache.poi.hwpf.usermodel.Range; | ||||
import org.apache.poi.poifs.filesystem.DirectoryNode; | import org.apache.poi.poifs.filesystem.DirectoryNode; | ||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | import org.apache.poi.poifs.filesystem.POIFSFileSystem; | ||||
import org.apache.poi.util.CodePageUtil; | |||||
import org.apache.poi.util.LittleEndian; | import org.apache.poi.util.LittleEndian; | ||||
import org.apache.poi.util.NotImplemented; | |||||
import org.apache.poi.util.StringUtil; | |||||
/** | /** | ||||
* Provides very simple support for old (Word 6 / Word 95) | * Provides very simple support for old (Word 6 / Word 95) | ||||
* files. | * files. | ||||
*/ | */ | ||||
public class HWPFOldDocument extends HWPFDocumentCore { | public class HWPFOldDocument extends HWPFDocumentCore { | ||||
private TextPieceTable tpt; | |||||
private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252; | |||||
private OldTextPieceTable tpt; | |||||
private StringBuilder _text; | private StringBuilder _text; | ||||
private final OldFontTable fontTable; | |||||
private final Charset guessedCharset; | |||||
public HWPFOldDocument(POIFSFileSystem fs) throws IOException { | public HWPFOldDocument(POIFSFileSystem fs) throws IOException { | ||||
this(fs.getRoot()); | this(fs.getRoot()); | ||||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc); | int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc); | ||||
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0); | int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0); | ||||
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4); | int papTableSize = LittleEndian.getInt(_mainStream, 0xc4); | ||||
//int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60); | |||||
//int shfTableSize = LittleEndian.getInt(_mainStream, 0x64); | |||||
int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0); | |||||
int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4); | |||||
fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize); | |||||
//TODO: figure out how to map runs/text pieces to fonts | |||||
//for now, if there's a non standard codepage in one of the fonts | |||||
//assume that the doc is in that codepage. | |||||
guessedCharset = guessCodePage(fontTable); | |||||
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160); | int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160); | ||||
// We need to get hold of the text that makes up the | // We need to get hold of the text that makes up the | ||||
// document, which might be regular or fast-saved | // document, which might be regular or fast-saved | ||||
ComplexFileTable cft = null; | ComplexFileTable cft = null; | ||||
StringBuffer text = new StringBuffer(); | |||||
if(_fib.getFibBase().isFComplex()) { | if(_fib.getFibBase().isFComplex()) { | ||||
cft = new ComplexFileTable( | |||||
cft = new OldComplexFileTable( | |||||
_mainStream, _mainStream, | _mainStream, _mainStream, | ||||
complexTableOffset, _fib.getFibBase().getFcMin() | |||||
complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset | |||||
); | ); | ||||
tpt = cft.getTextPieceTable(); | |||||
tpt = (OldTextPieceTable)cft.getTextPieceTable(); | |||||
for(TextPiece tp : tpt.getTextPieces()) { | |||||
text.append( tp.getStringBuilder() ); | |||||
} | |||||
} else { | } else { | ||||
// TODO Discover if these older documents can ever hold Unicode Strings? | // TODO Discover if these older documents can ever hold Unicode Strings? | ||||
// (We think not, because they seem to lack a Piece table) | // (We think not, because they seem to lack a Piece table) | ||||
// TODO Build the Piece Descriptor properly | // TODO Build the Piece Descriptor properly | ||||
// (We have to fake it, as they don't seem to have a proper Piece table) | // (We have to fake it, as they don't seem to have a proper Piece table) | ||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0); | |||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset); | |||||
pd.setFilePosition(_fib.getFibBase().getFcMin()); | pd.setFilePosition(_fib.getFibBase().getFcMin()); | ||||
// Generate a single Text Piece Table, with a single Text Piece | // Generate a single Text Piece Table, with a single Text Piece | ||||
// which covers all the (8 bit only) text in the file | // which covers all the (8 bit only) text in the file | ||||
tpt = new TextPieceTable(); | |||||
tpt = new OldTextPieceTable(); | |||||
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()]; | byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()]; | ||||
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); | System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); | ||||
int numChars = textData.length; | |||||
if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) { | |||||
numChars /= 2; | |||||
} | |||||
TextPiece tp = new TextPiece( | TextPiece tp = new TextPiece( | ||||
0, textData.length, textData, pd | |||||
0, numChars, textData, pd | |||||
); | ); | ||||
tpt.add(tp); | tpt.add(tp); | ||||
text.append(tp.getStringBuilder()); | |||||
} | } | ||||
_text = tpt.getText(); | _text = tpt.getText(); | ||||
// Now we can fetch the character and paragraph properties | // Now we can fetch the character and paragraph properties | ||||
} | } | ||||
} | } | ||||
/** | |||||
* Take the first codepage that is not default, ansi or symbol. | |||||
* Ideally, we'd want to track fonts with runs, but we don't yet | |||||
* know how to do that. | |||||
* | |||||
* Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi | |||||
* appears here. | |||||
* | |||||
* @param fontTable | |||||
* @return | |||||
*/ | |||||
private Charset guessCodePage(OldFontTable fontTable) { | |||||
for (OldFfn oldFfn : fontTable.getFontNames()) { | |||||
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff); | |||||
if (wmfCharset != null && | |||||
wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET && | |||||
wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET && | |||||
wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) { | |||||
return wmfCharset.getCharset(); | |||||
} | |||||
} | |||||
return DEFAULT_CHARSET; | |||||
} | |||||
public Range getOverallRange() | public Range getOverallRange() | ||||
{ | { | ||||
// Life is easy when we have no footers, headers or unicode! | // Life is easy when we have no footers, headers or unicode! | ||||
return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this ); | return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this ); | ||||
} | } | ||||
/** | |||||
* Use {@link #getOldFontTable()} instead!!! | |||||
* This always throws an UnsupportedOperationException. | |||||
* | |||||
* @return nothing | |||||
* @throws UnsupportedOperationException | |||||
*/ | |||||
@Override | |||||
@NotImplemented | |||||
public FontTable getFontTable() { | |||||
throw new UnsupportedOperationException("Use getOldFontTable instead."); | |||||
} | |||||
public OldFontTable getOldFontTable() { | |||||
return fontTable; | |||||
} | |||||
public Range getRange() | public Range getRange() | ||||
{ | { | ||||
return getOverallRange(); | return getOverallRange(); | ||||
public void write(OutputStream out) throws IOException { | public void write(OutputStream out) throws IOException { | ||||
throw new IllegalStateException("Writing is not available for the older file formats"); | throw new IllegalStateException("Writing is not available for the older file formats"); | ||||
} | } | ||||
/** | |||||
* As a rough heuristic (total hack), read through the font table | |||||
* and take the first non-default, non-ansi, non-symbol | |||||
* font's charset and return that. | |||||
* | |||||
* Once we figure out how to link a font to a text piece, we should | |||||
* use the font information per text piece. | |||||
* | |||||
* @return charset | |||||
*/ | |||||
public Charset getGuessedCharset() { | |||||
return guessedCharset; | |||||
} | |||||
} | } |
package org.apache.poi.hwpf.model; | package org.apache.poi.hwpf.model; | ||||
import java.io.IOException; | import java.io.IOException; | ||||
import java.nio.charset.Charset; | |||||
import java.util.LinkedList; | import java.util.LinkedList; | ||||
import java.util.List; | import java.util.List; | ||||
import org.apache.poi.hwpf.sprm.SprmBuffer; | import org.apache.poi.hwpf.sprm.SprmBuffer; | ||||
import org.apache.poi.util.Internal; | import org.apache.poi.util.Internal; | ||||
import org.apache.poi.util.LittleEndian; | import org.apache.poi.util.LittleEndian; | ||||
import org.apache.poi.util.StringUtil; | |||||
@Internal | @Internal | ||||
public final class ComplexFileTable { | |||||
public class ComplexFileTable { | |||||
private static final byte GRPPRL_TYPE = 1; | private static final byte GRPPRL_TYPE = 1; | ||||
private static final byte TEXT_PIECE_TABLE_TYPE = 2; | private static final byte TEXT_PIECE_TABLE_TYPE = 2; | ||||
_tpt = new TextPieceTable(); | _tpt = new TextPieceTable(); | ||||
} | } | ||||
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException { | |||||
protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin, | |||||
Charset charset) throws IOException { | |||||
//skips through the prms before we reach the piece table. These contain data | //skips through the prms before we reach the piece table. These contain data | ||||
//for actual fast saved files | //for actual fast saved files | ||||
List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>(); | List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>(); | ||||
} | } | ||||
int pieceTableSize = LittleEndian.getInt(tableStream, ++offset); | int pieceTableSize = LittleEndian.getInt(tableStream, ++offset); | ||||
offset += LittleEndian.INT_SIZE; | offset += LittleEndian.INT_SIZE; | ||||
_tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); | |||||
_tpt = newTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset); | |||||
} | |||||
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException { | |||||
this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252); | |||||
} | } | ||||
public TextPieceTable getTextPieceTable() { | public TextPieceTable getTextPieceTable() { | ||||
tableStream.write(table); | tableStream.write(table); | ||||
} | } | ||||
protected TextPieceTable newTextPieceTable(byte[] documentStream, | |||||
byte[] tableStream, int offset, int pieceTableSize, int fcMin, | |||||
Charset charset) { | |||||
return new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); | |||||
} | |||||
} | } |
* @param fcMin | * @param fcMin | ||||
*/ | */ | ||||
public OldCHPBinTable(byte[] documentStream, int offset, | public OldCHPBinTable(byte[] documentStream, int offset, | ||||
int size, int fcMin, TextPieceTable tpt) | |||||
int size, int fcMin, OldTextPieceTable tpt) | |||||
{ | { | ||||
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); | PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); | ||||
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.hwpf.model; | |||||
import java.io.IOException; | |||||
import java.nio.charset.Charset; | |||||
import org.apache.poi.util.Internal; | |||||
@Internal | |||||
public final class OldComplexFileTable extends ComplexFileTable { | |||||
public OldComplexFileTable(byte[] documentStream, byte[] tableStream, | |||||
int offset, int fcMin, Charset charset) throws IOException { | |||||
super(documentStream, tableStream, offset, fcMin, charset); | |||||
} | |||||
@Override | |||||
protected TextPieceTable newTextPieceTable(byte[] documentStream, | |||||
byte[] tableStream, int offset, | |||||
int pieceTableSize, int fcMin, Charset charset) { | |||||
return new OldTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset); | |||||
} | |||||
} |
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.hwpf.model; | |||||
import java.nio.charset.Charset; | |||||
import org.apache.poi.hwmf.record.HwmfFont; | |||||
import org.apache.poi.util.Internal; | |||||
import org.apache.poi.util.LittleEndian; | |||||
import org.apache.poi.util.POILogFactory; | |||||
import org.apache.poi.util.POILogger; | |||||
import org.apache.poi.util.StringUtil; | |||||
/** | |||||
* Word 6.0 Font information | |||||
*/ | |||||
@Internal | |||||
public final class OldFfn { | |||||
private static final POILogger logger = POILogFactory.getLogger(OldFfn.class); | |||||
private byte _chs;// character set identifier | |||||
private final String fontName; | |||||
private final String altFontName; | |||||
private final int length; //length in bytes for this record | |||||
/** | |||||
* try to read an OldFfn starting at offset; read no farther than end | |||||
* | |||||
* @param buf buffer from which to read | |||||
* @param offset offset at which to start | |||||
* @param fontTableEnd read no farther than this | |||||
* @return an OldFfn or null if asked to read beyond end | |||||
*/ | |||||
static OldFfn build(byte[] buf, int offset, int fontTableEnd) { | |||||
int start = offset; | |||||
//preliminary bytes | |||||
if (offset + 6 > fontTableEnd) { | |||||
return null; | |||||
} | |||||
//first byte | |||||
short fontDescriptionLength = (short) buf[offset]; | |||||
offset += 1; | |||||
if (offset + fontDescriptionLength > fontTableEnd) { | |||||
logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font"); | |||||
return null; | |||||
} | |||||
//no idea what these 3 bytes do | |||||
offset += 3; | |||||
byte chs = buf[offset]; | |||||
Charset charset = null; | |||||
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff); | |||||
if (wmfCharset == null) { | |||||
logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff)); | |||||
} else { | |||||
charset = wmfCharset.getCharset(); | |||||
} | |||||
charset = charset == null ? StringUtil.WIN_1252 : charset; | |||||
offset += LittleEndian.BYTE_SIZE; | |||||
//if this byte here == 7, it _may_ signify existence of | |||||
//an altername font name | |||||
//not sure what the byte after the _chs does | |||||
offset += LittleEndian.BYTE_SIZE; | |||||
int fontNameLength = -1; | |||||
for (int i = offset; i < fontTableEnd; i++) { | |||||
if (buf[i] == 0) { | |||||
fontNameLength = i - offset; | |||||
break; | |||||
} | |||||
} | |||||
if (fontNameLength == -1) { | |||||
logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length"); | |||||
return null; | |||||
} | |||||
String fontName = new String(buf, offset, fontNameLength, charset); | |||||
String altFontName = null; | |||||
int altFontNameLength = -1; | |||||
offset += fontNameLength + 1; | |||||
if (offset - start < fontDescriptionLength) { | |||||
for (int i = offset; i <= start + fontDescriptionLength; i++) { | |||||
if (buf[i] == 0) { | |||||
altFontNameLength = i - offset; | |||||
break; | |||||
} | |||||
} | |||||
if (altFontNameLength > -1) { | |||||
altFontName = new String(buf, offset, altFontNameLength, charset); | |||||
} | |||||
} | |||||
//reset to 0 for length calculation | |||||
altFontNameLength = (altFontNameLength < 0) ? 0 : altFontNameLength + 1;//add one for zero byte | |||||
int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE +//6 starting bytes | |||||
fontNameLength + altFontNameLength + 1;//+1 is for the zero byte | |||||
//this len should == fontDescriptionLength | |||||
return new OldFfn(chs, fontName, altFontName, len); | |||||
} | |||||
public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) { | |||||
this._chs = charsetIdentifier; | |||||
this.fontName = fontName; | |||||
this.altFontName = altFontName; | |||||
this.length = length; | |||||
} | |||||
public byte getChs() { | |||||
return _chs; | |||||
} | |||||
public String getMainFontName() { | |||||
return fontName; | |||||
} | |||||
/** | |||||
* @return altFontName if it exists, null otherwise | |||||
*/ | |||||
public String getAltFontName() { | |||||
return altFontName; | |||||
} | |||||
/** | |||||
* @return length in bytes for this record | |||||
*/ | |||||
public int getLength() { | |||||
return length; | |||||
} | |||||
@Override | |||||
public String toString() { | |||||
return "OldFfn{" + | |||||
"_chs=" + (_chs & 0xff) + | |||||
", fontName='" + fontName + '\'' + | |||||
", altFontName='" + altFontName + '\'' + | |||||
", length=" + length + | |||||
'}'; | |||||
} | |||||
} | |||||
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.hwpf.model; | |||||
import java.util.ArrayList; | |||||
import java.util.Arrays; | |||||
import java.util.List; | |||||
import org.apache.poi.util.Internal; | |||||
import org.apache.poi.util.LittleEndian; | |||||
import org.apache.poi.util.POILogFactory; | |||||
import org.apache.poi.util.POILogger; | |||||
/** | |||||
* Font table for Word 6.0 | |||||
*/ | |||||
@Internal | |||||
public final class OldFontTable { | |||||
private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class); | |||||
// added extra facilitator members | |||||
// FFN structure containing strings of font names | |||||
private final OldFfn[] _fontNames; | |||||
public OldFontTable(byte[] buf, int offset, int length) { | |||||
//length is stored at the index section in the table | |||||
//and it is recorded in the first short. | |||||
List<OldFfn> ffns = new ArrayList<OldFfn>(); | |||||
int fontTableLength = LittleEndian.getShort(buf, offset); | |||||
int endOfTableOffset = offset + length; | |||||
int startOffset = offset + LittleEndian.SHORT_SIZE;//first short should == length! | |||||
while (true) { | |||||
OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset); | |||||
if (oldFfn == null) { | |||||
break; | |||||
} | |||||
ffns.add(oldFfn); | |||||
startOffset += oldFfn.getLength(); | |||||
} | |||||
_fontNames = ffns.toArray(new OldFfn[ffns.size()]); | |||||
} | |||||
public OldFfn[] getFontNames() { | |||||
return _fontNames; | |||||
} | |||||
public String getMainFont(int chpFtc) { | |||||
if (chpFtc >= _fontNames.length) { | |||||
_logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount"); | |||||
return null; | |||||
} | |||||
return _fontNames[chpFtc].getMainFontName(); | |||||
} | |||||
@Override | |||||
public String toString() { | |||||
return "OldFontTable{" + | |||||
"_fontNames=" + Arrays.toString(_fontNames) + | |||||
'}'; | |||||
} | |||||
} |
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.hwpf.model; | |||||
import org.apache.poi.util.Internal; | |||||
import org.apache.poi.util.NotImplemented; | |||||
/** | |||||
* Lightweight representation of a text piece. | |||||
* Works in the character domain, not the byte domain, so you | |||||
* need to have turned byte references into character | |||||
* references before getting here. | |||||
*/ | |||||
@Internal | |||||
public class OldTextPiece extends TextPiece { | |||||
private final byte[] rawBytes; | |||||
/** | |||||
* @param start Beginning offset in main document stream, in characters. | |||||
* @param end Ending offset in main document stream, in characters. | |||||
* @param text The raw bytes of our text | |||||
*/ | |||||
public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) { | |||||
super(start, end, text, pd); | |||||
this.rawBytes = text; | |||||
if (end < start) { | |||||
throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end); | |||||
} | |||||
} | |||||
/** | |||||
* @return nothing, ever. Always throws an UnsupportedOperationException | |||||
* @throws UnsupportedOperationException | |||||
*/ | |||||
@NotImplemented | |||||
@Override | |||||
public boolean isUnicode() { | |||||
throw new UnsupportedOperationException(); | |||||
} | |||||
public StringBuilder getStringBuilder() { | |||||
return (StringBuilder) _buf; | |||||
} | |||||
@Override | |||||
public byte[] getRawBytes() { | |||||
byte[] buf = new byte[rawBytes.length]; | |||||
System.arraycopy(rawBytes, 0, buf, 0, rawBytes.length); | |||||
return buf; | |||||
} | |||||
/** | |||||
* Returns part of the string. | |||||
* Works only in characters, not in bytes! | |||||
* | |||||
* @param start Local start position, in characters | |||||
* @param end Local end position, in characters | |||||
* @throws UnsupportedOperationException | |||||
*/ | |||||
@Deprecated | |||||
@NotImplemented | |||||
public String substring(int start, int end) { | |||||
throw new UnsupportedOperationException(); | |||||
} | |||||
/** | |||||
* Not implemented for OldTextPiece. | |||||
* Always throws UnsupportedOperationException | |||||
*/ | |||||
@Deprecated | |||||
@NotImplemented | |||||
public void adjustForDelete(int start, int length) { | |||||
throw new UnsupportedOperationException(); | |||||
} | |||||
/** | |||||
* Returns the length, in bytes | |||||
*/ | |||||
public int bytesLength() { | |||||
return rawBytes.length; | |||||
} | |||||
@Override | |||||
public int hashCode() { | |||||
assert false : "hashCode not designed"; | |||||
return 42; // any arbitrary constant will do | |||||
} | |||||
/** | |||||
* Returns the character position we start at. | |||||
*/ | |||||
public int getCP() { | |||||
return getStart(); | |||||
} | |||||
public String toString() { | |||||
return "OldTextPiece from " + getStart() + " to " + getEnd() + " (" | |||||
+ getPieceDescriptor() + ")"; | |||||
} | |||||
} |
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.hwpf.model; | |||||
import java.nio.charset.Charset; | |||||
import java.util.ArrayList; | |||||
import java.util.Collections; | |||||
import org.apache.poi.util.CodePageUtil; | |||||
import org.apache.poi.util.Internal; | |||||
import org.apache.poi.util.POILogFactory; | |||||
import org.apache.poi.util.POILogger; | |||||
@Internal | |||||
public class OldTextPieceTable extends TextPieceTable { | |||||
private static final POILogger logger = POILogFactory | |||||
.getLogger(OldTextPieceTable.class); | |||||
public OldTextPieceTable() { | |||||
super(); | |||||
} | |||||
public OldTextPieceTable(byte[] documentStream, byte[] tableStream, | |||||
int offset, int size, int fcMin, Charset charset) { | |||||
//super(documentStream, tableStream, offset, size, fcMin, charset); | |||||
// get our plex of PieceDescriptors | |||||
PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, | |||||
PieceDescriptor.getSizeInBytes()); | |||||
int length = pieceTable.length(); | |||||
PieceDescriptor[] pieces = new PieceDescriptor[length]; | |||||
// iterate through piece descriptors raw bytes and create | |||||
// PieceDescriptor objects | |||||
for (int x = 0; x < length; x++) { | |||||
GenericPropertyNode node = pieceTable.getProperty(x); | |||||
pieces[x] = new PieceDescriptor(node.getBytes(), 0, charset); | |||||
} | |||||
// Figure out the cp of the earliest text piece | |||||
// Note that text pieces don't have to be stored in order! | |||||
_cpMin = pieces[0].getFilePosition() - fcMin; | |||||
for (PieceDescriptor piece : pieces) { | |||||
int start = piece.getFilePosition() - fcMin; | |||||
if (start < _cpMin) { | |||||
_cpMin = start; | |||||
} | |||||
} | |||||
// using the PieceDescriptors, build our list of TextPieces. | |||||
for (int x = 0; x < pieces.length; x++) { | |||||
int start = pieces[x].getFilePosition(); | |||||
GenericPropertyNode node = pieceTable.getProperty(x); | |||||
// Grab the start and end, which are in characters | |||||
int nodeStartChars = node.getStart(); | |||||
int nodeEndChars = node.getEnd(); | |||||
// What's the relationship between bytes and characters? | |||||
boolean unicode = pieces[x].isUnicode(); | |||||
int multiple = 1; | |||||
if (unicode || | |||||
(charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) { | |||||
multiple = 2; | |||||
} | |||||
// Figure out the length, in bytes and chars | |||||
int textSizeChars = (nodeEndChars - nodeStartChars); | |||||
int textSizeBytes = textSizeChars * multiple; | |||||
// Grab the data that makes up the piece | |||||
byte[] buf = new byte[textSizeBytes]; | |||||
System.arraycopy(documentStream, start, buf, 0, textSizeBytes); | |||||
// And now build the piece | |||||
final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf, | |||||
pieces[x]); | |||||
_textPieces.add(newTextPiece); | |||||
} | |||||
// In the interest of our sanity, now sort the text pieces | |||||
// into order, if they're not already | |||||
Collections.sort(_textPieces); | |||||
_textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces); | |||||
Collections.sort(_textPiecesFCOrder, new FCComparator()); | |||||
} | |||||
@Override | |||||
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) { | |||||
return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd); | |||||
} | |||||
@Override | |||||
protected int getEncodingMultiplier(TextPiece textPiece) { | |||||
Charset charset = textPiece.getPieceDescriptor().getCharset(); | |||||
if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) { | |||||
return 2; | |||||
} | |||||
return 1; | |||||
} | |||||
} |
SprmBuffer sprmBuffer = null; | SprmBuffer sprmBuffer = null; | ||||
for ( PAPX papx : papxs ) | for ( PAPX papx : papxs ) | ||||
{ | { | ||||
if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 ) | |||||
if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 ) | |||||
continue; | continue; | ||||
if ( sprmBuffer == null ) { | if ( sprmBuffer == null ) { |
package org.apache.poi.hwpf.model; | package org.apache.poi.hwpf.model; | ||||
import java.nio.charset.Charset; | |||||
import org.apache.poi.util.BitField; | import org.apache.poi.util.BitField; | ||||
import org.apache.poi.util.BitFieldFactory; | import org.apache.poi.util.BitFieldFactory; | ||||
import org.apache.poi.util.Internal; | import org.apache.poi.util.Internal; | ||||
import org.apache.poi.util.LittleEndian; | import org.apache.poi.util.LittleEndian; | ||||
import org.apache.poi.util.StringUtil; | |||||
@Internal | @Internal | ||||
public final class PieceDescriptor | public final class PieceDescriptor | ||||
private static BitField fCopied = BitFieldFactory.getInstance(0x04); | private static BitField fCopied = BitFieldFactory.getInstance(0x04); | ||||
int fc; | int fc; | ||||
PropertyModifier prm; | PropertyModifier prm; | ||||
boolean unicode; | |||||
boolean unicode = false; | |||||
private final Charset charset; | |||||
public PieceDescriptor(byte[] buf, int offset) | |||||
{ | |||||
descriptor = LittleEndian.getShort(buf, offset); | |||||
offset += LittleEndian.SHORT_SIZE; | |||||
fc = LittleEndian.getInt(buf, offset); | |||||
offset += LittleEndian.INT_SIZE; | |||||
prm = new PropertyModifier( LittleEndian.getShort(buf, offset)); | |||||
// see if this piece uses unicode. | |||||
if ((fc & 0x40000000) == 0) | |||||
{ | |||||
unicode = true; | |||||
} | |||||
else | |||||
{ | |||||
unicode = false; | |||||
fc &= ~(0x40000000);//gives me FC in doc stream | |||||
fc /= 2; | |||||
public PieceDescriptor(byte[] buf, int offset) {
    // No charset given: delegate with null so the three-argument
    // constructor takes its charset == null branch.
    this(buf, offset, (Charset) null);
}
/** | |||||
* | |||||
* This initializer should only be used for HWPFOldDocuments. | |||||
* | |||||
* @param buf | |||||
* @param offset | |||||
* @param charset which charset to use if this is not unicode | |||||
*/ | |||||
public PieceDescriptor(byte[] buf, int offset, Charset charset) {
    // Read, in order: a short descriptor, an int fc, and a short that is
    // wrapped in a PropertyModifier.
    descriptor = LittleEndian.getShort(buf, offset);
    offset += LittleEndian.SHORT_SIZE;
    fc = LittleEndian.getInt(buf, offset);
    offset += LittleEndian.INT_SIZE;
    prm = new PropertyModifier(LittleEndian.getShort(buf, offset));
    if (charset == null) {
        // see if this piece uses unicode.
        //From the documentation: If the second most significant bit
        //is clear, then this indicates the actual file offset of the Unicode character (two bytes). If the
        //second most significant bit is set, then the actual address of the codepage-1252
        //compressed version of the Unicode character (one byte), is actually at the offset indicated
        //by clearing this bit and dividing by two.
        if ((fc & 0x40000000) == 0) {
            unicode = true;
            this.charset = null;
        } else {
            unicode = false;
            fc &= ~(0x40000000);//gives me FC in doc stream
            fc /= 2;
            this.charset = StringUtil.WIN_1252;
        }
    } else {
        // An explicit charset was supplied: fc is left untouched and the
        // piece counts as unicode only for UTF-16LE. Compare by value, not
        // by reference, so an equal Charset instance is recognised too.
        unicode = StringUtil.UTF16LE.equals(charset);
        this.charset = charset;
    }
}
public int getFilePosition() | public int getFilePosition() | ||||
return unicode; | return unicode; | ||||
} | } | ||||
/** | |||||
* | |||||
* @return charset to use if this is not a Unicode PieceDescriptor | |||||
* this can be <code>null</code> | |||||
*/ | |||||
public Charset getCharset() { | |||||
return charset; | |||||
} | |||||
public PropertyModifier getPrm() | public PropertyModifier getPrm() | ||||
{ | { | ||||
return prm; | return prm; |
import java.nio.charset.Charset; | import java.nio.charset.Charset; | ||||
import org.apache.poi.util.Internal; | import org.apache.poi.util.Internal; | ||||
import org.apache.poi.util.StringUtil; | |||||
/** | /** | ||||
* Lightweight representation of a text piece. | * Lightweight representation of a text piece. | ||||
* @param start Beginning offset in main document stream, in characters. | * @param start Beginning offset in main document stream, in characters. | ||||
* @param end Ending offset in main document stream, in characters. | * @param end Ending offset in main document stream, in characters. | ||||
* @param text The raw bytes of our text | * @param text The raw bytes of our text | ||||
* @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)} | |||||
* instead | * instead | ||||
*/ | */ | ||||
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, | public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, | ||||
* Create the StringBuilder from the text and unicode flag | * Create the StringBuilder from the text and unicode flag | ||||
*/ | */ | ||||
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { | private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { | ||||
String str = new String(text, Charset.forName(pd.isUnicode() ? "UTF-16LE" : "Cp1252")); | |||||
byte[] textBuffer = text; | |||||
if (StringUtil.BIG5.equals(pd.getCharset())) { | |||||
String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString(); | |||||
return new StringBuilder(txt); | |||||
} | |||||
String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset()); | |||||
return new StringBuilder(str); | return new StringBuilder(str); | ||||
} | } | ||||
return "TextPiece from " + getStart() + " to " + getEnd() + " (" | return "TextPiece from " + getStart() + " to " + getEnd() + " (" | ||||
+ getPieceDescriptor() + ")"; | + getPieceDescriptor() + ")"; | ||||
} | } | ||||
} | } |
System.arraycopy(documentStream, start, buf, 0, textSizeBytes); | System.arraycopy(documentStream, start, buf, 0, textSizeBytes); | ||||
// And now build the piece | // And now build the piece | ||||
final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf, | |||||
final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf, | |||||
pieces[x]); | pieces[x]); | ||||
_textPieces.add(newTextPiece); | _textPieces.add(newTextPiece); | ||||
Collections.sort(_textPiecesFCOrder, new FCComparator()); | Collections.sort(_textPiecesFCOrder, new FCComparator()); | ||||
} | } | ||||
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) { | |||||
return new TextPiece(nodeStartChars, nodeEndChars, buf, pd); | |||||
} | |||||
public void add(TextPiece piece) { | public void add(TextPiece piece) { | ||||
_textPieces.add(piece); | _textPieces.add(piece); | ||||
_textPiecesFCOrder.add(piece); | _textPiecesFCOrder.add(piece); | ||||
if (rangeStartBytes > rangeEndBytes) | if (rangeStartBytes > rangeEndBytes) | ||||
continue; | continue; | ||||
final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1; | |||||
final int encodingMultiplier = getEncodingMultiplier(textPiece); | |||||
final int rangeStartCp = textPiece.getStart() | final int rangeStartCp = textPiece.getStart() | ||||
+ (rangeStartBytes - tpStart) / encodingMultiplier; | + (rangeStartBytes - tpStart) / encodingMultiplier; | ||||
return result.toArray(new int[result.size()][]); | return result.toArray(new int[result.size()][]); | ||||
} | } | ||||
protected int getEncodingMultiplier(TextPiece textPiece) { | |||||
return textPiece.isUnicode() ? 2 : 1; | |||||
} | |||||
public int getCpMin() { | public int getCpMin() { | ||||
return _cpMin; | return _cpMin; | ||||
} | } | ||||
return textPlex.toByteArray(); | return textPlex.toByteArray(); | ||||
} | } | ||||
private static class FCComparator implements Comparator<TextPiece>, Serializable { | |||||
protected static class FCComparator implements Comparator<TextPiece>, Serializable { | |||||
public int compare(TextPiece textPiece, TextPiece textPiece1) { | public int compare(TextPiece textPiece, TextPiece textPiece1) { | ||||
if (textPiece.getPieceDescriptor().fc > textPiece1 | if (textPiece.getPieceDescriptor().fc > textPiece1 | ||||
.getPieceDescriptor().fc) { | .getPieceDescriptor().fc) { |
package org.apache.poi.hwpf.usermodel; | package org.apache.poi.hwpf.usermodel; | ||||
import org.apache.poi.hwpf.HWPFDocument; | import org.apache.poi.hwpf.HWPFDocument; | ||||
import org.apache.poi.hwpf.HWPFOldDocument; | |||||
import org.apache.poi.hwpf.model.CHPX; | import org.apache.poi.hwpf.model.CHPX; | ||||
import org.apache.poi.hwpf.model.FFData; | import org.apache.poi.hwpf.model.FFData; | ||||
import org.apache.poi.hwpf.model.Ffn; | import org.apache.poi.hwpf.model.Ffn; | ||||
public String getFontName() | public String getFontName() | ||||
{ | { | ||||
if (_doc instanceof HWPFOldDocument) { | |||||
return ((HWPFOldDocument) _doc).getOldFontTable().getMainFont(_props.getFtcAscii()); | |||||
} | |||||
if (_doc.getFontTable() == null) | if (_doc.getFontTable() == null) | ||||
// old word format | // old word format | ||||
return null; | return null; |
==================================================================== */ | ==================================================================== */ | ||||
package org.apache.poi.hwpf.converter; | package org.apache.poi.hwpf.converter; | ||||
import java.io.File; | |||||
import java.io.FilenameFilter; | |||||
import java.io.StringWriter; | |||||
import java.util.ArrayList; | |||||
import java.util.Arrays; | |||||
import java.util.List; | |||||
import static org.junit.Assert.assertNotNull; | |||||
import javax.xml.transform.OutputKeys; | import javax.xml.transform.OutputKeys; | ||||
import javax.xml.transform.Transformer; | import javax.xml.transform.Transformer; | ||||
import javax.xml.transform.TransformerFactory; | import javax.xml.transform.TransformerFactory; | ||||
import javax.xml.transform.dom.DOMSource; | import javax.xml.transform.dom.DOMSource; | ||||
import javax.xml.transform.stream.StreamResult; | import javax.xml.transform.stream.StreamResult; | ||||
import java.io.File; | |||||
import java.io.FilenameFilter; | |||||
import java.io.StringWriter; | |||||
import java.util.ArrayList; | |||||
import java.util.Arrays; | |||||
import java.util.List; | |||||
import org.apache.poi.POIDataSamples; | import org.apache.poi.POIDataSamples; | ||||
import org.apache.poi.hwpf.HWPFDocumentCore; | import org.apache.poi.hwpf.HWPFDocumentCore; | ||||
import org.junit.runner.RunWith; | import org.junit.runner.RunWith; | ||||
import org.junit.runners.Parameterized; | import org.junit.runners.Parameterized; | ||||
import static org.junit.Assert.assertNotNull; | |||||
@RunWith(Parameterized.class) | @RunWith(Parameterized.class) | ||||
public class TestWordToConverterSuite | public class TestWordToConverterSuite | ||||
{ | { | ||||
* YK: a quick hack to exclude failing documents from the suite. | * YK: a quick hack to exclude failing documents from the suite. | ||||
*/ | */ | ||||
private static List<String> failingFiles = Arrays | private static List<String> failingFiles = Arrays | ||||
.asList( "ProblemExtracting.doc" ); | |||||
.asList( "ProblemExtracting.doc", | |||||
"Bug50955.doc" //basic extraction works, | |||||
// but these extractors modify the document, | |||||
// which is a no-go for this Word 6.0 file | |||||
); | |||||
@Parameterized.Parameters(name="{index}: {0}") | @Parameterized.Parameters(name="{index}: {0}") | ||||
public static Iterable<Object[]> files() { | public static Iterable<Object[]> files() { |
* against HWPF | * against HWPF | ||||
*/ | */ | ||||
public class TestBugs{ | public class TestBugs{ | ||||
private static final POILogger logger = POILogFactory.getLogger(TestBugs.class); | private static final POILogger logger = POILogFactory.getLogger(TestBugs.class); | ||||
public static void assertEqualsIgnoreNewline(String expected, String actual ) | public static void assertEqualsIgnoreNewline(String expected, String actual ) | ||||
hwpfDocument.getPicturesTable().getAllPictures(); | hwpfDocument.getPicturesTable().getAllPictures(); | ||||
} | } | ||||
/** | |||||
* [FAILING] Bug 50955 - error while retrieving the text file | |||||
*/ | |||||
@Test(expected=IllegalStateException.class) | |||||
public void test50955() throws IOException { | |||||
getTextOldFile("Bug50955.doc"); | |||||
} | |||||
/** | /** | ||||
* [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta | * [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta |
package org.apache.poi.hwpf.usermodel; | package org.apache.poi.hwpf.usermodel; | ||||
import static org.apache.poi.POITestCase.assertContains; | |||||
import static org.junit.Assert.assertEquals; | import static org.junit.Assert.assertEquals; | ||||
import java.io.IOException; | import java.io.IOException; | ||||
import java.nio.charset.Charset; | |||||
import org.apache.poi.OldFileFormatException; | import org.apache.poi.OldFileFormatException; | ||||
import org.apache.poi.hwmf.record.HwmfFont; | |||||
import org.apache.poi.hwpf.HWPFOldDocument; | import org.apache.poi.hwpf.HWPFOldDocument; | ||||
import org.apache.poi.hwpf.HWPFTestCase; | import org.apache.poi.hwpf.HWPFTestCase; | ||||
import org.apache.poi.hwpf.HWPFTestDataSamples; | import org.apache.poi.hwpf.HWPFTestDataSamples; | ||||
import org.apache.poi.hwpf.extractor.Word6Extractor; | |||||
import org.apache.poi.hwpf.model.OldFontTable; | |||||
import org.junit.Test; | import org.junit.Test; | ||||
/** | /** | ||||
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns()); | assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns()); | ||||
// Normal, superscript for 4th, normal | // Normal, superscript for 4th, normal | ||||
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns()); | assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns()); | ||||
doc.close(); | doc.close(); | ||||
} | } | ||||
doc.getRange().getParagraph(1).text()); | doc.getRange().getParagraph(1).text()); | ||||
doc.close(); | doc.close(); | ||||
} | } | ||||
@Test | |||||
public void testDefaultCodePageEncoding() throws IOException { | |||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc"); | |||||
Word6Extractor ex = new Word6Extractor(doc); | |||||
String txt = ex.getText(); | |||||
assertContains(txt, "BERTHOD"); | |||||
assertContains(txt, "APPLICOLOR"); | |||||
assertContains(txt, "les meilleurs"); | |||||
assertContains(txt, "GUY LECOLE"); | |||||
} | |||||
@Test | |||||
public void testCodePageBug50955() throws IOException { | |||||
//windows 1251 | |||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc"); | |||||
Word6Extractor ex = new Word6Extractor(doc); | |||||
StringBuilder sb = new StringBuilder(); | |||||
for (String p : ex.getParagraphText()) { | |||||
sb.append(p); | |||||
} | |||||
assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings! | |||||
} | |||||
@Test | |||||
public void testCodePageBug60936() throws IOException { | |||||
//windows 1250 -- this test file was generated with OpenOffice | |||||
//see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration | |||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc"); | |||||
Word6Extractor ex = new Word6Extractor(doc); | |||||
StringBuilder sb = new StringBuilder(); | |||||
for (String p : ex.getParagraphText()) { | |||||
sb.append(p); | |||||
} | |||||
assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings! | |||||
} | |||||
@Test | |||||
public void testOldFontTableEncoding() throws IOException { | |||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); | |||||
OldFontTable oldFontTable = doc.getOldFontTable(); | |||||
assertEquals(5, oldFontTable.getFontNames().length); | |||||
assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName()); | |||||
assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5")); | |||||
assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName()); | |||||
doc.close(); | |||||
} | |||||
@Test | |||||
public void testOldFontTableAltName() throws IOException { | |||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc"); | |||||
OldFontTable oldFontTable = doc.getOldFontTable(); | |||||
assertEquals(5, oldFontTable.getFontNames().length); | |||||
assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName()); | |||||
assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName()); | |||||
assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName()); | |||||
assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName()); | |||||
} | |||||
@Test | |||||
public void test51944() throws IOException { | |||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); | |||||
Word6Extractor ex = new Word6Extractor(doc); | |||||
StringBuilder sb = new StringBuilder(); | |||||
for (String p : ex.getParagraphText()) { | |||||
sb.append(p.replaceAll("[\r\n]+", "\n")); | |||||
} | |||||
String txt = sb.toString(); | |||||
assertContains(txt, "Post and Fax"); | |||||
assertContains(txt, "also maintain");//this is at a critical juncture | |||||
assertContains(txt, "which are available for");//this too | |||||
//TODO: figure out why these two aren't passing | |||||
// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly | |||||
// assertContains(txt, "We are able to");//not sure if we can get this easily? | |||||
} | |||||
} | } |